Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -180,6 +180,8 @@
     return false;
   BasicBlock *BB = I0->getParent();
   for (int i = 1, e = VL.size(); i < e; i++) {
+    if (isa<UndefValue>(VL[i]))
+      continue;
     Instruction *I = dyn_cast<Instruction>(VL[i]);
     if (!I)
       return false;
@@ -348,9 +350,18 @@
 /// could be vectorized even if its structure is diverse.
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                        unsigned BaseIndex = 0) {
-  // Make sure these are all Instructions.
-  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
+  // Make sure these are all Instructions or UndefValues.
+  if (llvm::any_of(VL,
+                   [](Value *V) {
+                     return !isa<Instruction>(V) && !isa<UndefValue>(V);
+                   }) ||
+      llvm::all_of(VL, [](Value *V) { return isa<UndefValue>(V); }))
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+  for (unsigned I = BaseIndex, E = VL.size(); I < E; I++)
+    if (isa<Instruction>(VL[I])) {
+      BaseIndex = I;
+      break;
+    }
 
   bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
   bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
@@ -361,6 +372,8 @@
   // Check for one alternate opcode from another BinaryOperator.
   // TODO - generalize to support all operators (types, calls etc.).
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
+    if (isa<UndefValue>(VL[Cnt]))
+      continue;
     unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
     if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
       if (InstOpcode == Opcode || InstOpcode == AltOpcode)
@@ -655,7 +668,8 @@
   /// \returns the scalarization cost for this type. Scalarization in this
   /// context means the creation of vectors from a group of scalars.
-  int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices);
+  int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices,
+                    const DenseSet<unsigned> &IgnoredIndices);
 
   /// \returns the scalarization cost for this list of values. Assuming that
   /// this subtree gets vectorized, we may need to extract the values from the
@@ -807,20 +821,20 @@
                     ArrayRef<unsigned> ReuseShuffleIndices = None,
                     ArrayRef<unsigned> ReorderIndices = None) {
     VectorizableTree.emplace_back(VectorizableTree);
-    int idx = VectorizableTree.size() - 1;
-    TreeEntry *Last = &VectorizableTree[idx];
+    int Idx = VectorizableTree.size() - 1;
+    TreeEntry *Last = &VectorizableTree[Idx];
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->NeedToGather = !Vectorized;
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
     Last->ReorderIndices = ReorderIndices;
-    if (Vectorized) {
-      for (int i = 0, e = VL.size(); i != e; ++i) {
-        assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
-        ScalarToTreeEntry[VL[i]] = idx;
+    for (Value *V : make_filter_range(VL, Instruction::classof)) {
+      if (Vectorized) {
+        assert(!getTreeEntry(V) && "Scalar already in tree!");
+        ScalarToTreeEntry[V] = Idx;
+      } else {
+        MustGather.insert(V);
       }
-    } else {
-      MustGather.insert(VL.begin(), VL.end());
     }
 
     if (UserTreeIdx.Idx >= 0)
@@ -828,7 +842,7 @@
 
     Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices);
 
-    UserTreeIdx.Idx = idx;
+    UserTreeIdx.Idx = Idx;
   }
 
   /// -- Vectorization State --
@@ -1457,6 +1471,8 @@
   // For each lane:
   for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
     Value *Scalar = Entry->Scalars[Lane];
+    if (isa<UndefValue>(Scalar))
+      continue;
     int FoundLane = Lane;
     if (!Entry->ReuseShuffleIndices.empty()) {
       FoundLane =
@@ -1480,7 +1496,8 @@
 
       // Skip in-tree scalars that become vectors
       if (TreeEntry *UseEntry = getTreeEntry(U)) {
-        Value *UseScalar = UseEntry->Scalars[0];
+        Value *UseScalar =
+            *llvm::find_if(UseEntry->Scalars, Instruction::classof);
         // Some in-tree scalars will remain as scalar in vectorized
         // instructions. If that is the case, the one in Lane 0 will
         // be used.
@@ -1541,9 +1558,9 @@
   // the same block.
 
   // Don't vectorize ephemeral values.
-  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
-    if (EphValues.count(VL[i])) {
-      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+  for (Value *V : VL) {
+    if (!isa<UndefValue>(V) && EphValues.count(V)) {
+      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                         << ") is ephemeral.\n");
       newTreeEntry(VL, false, UserTreeIdx);
       return;
@@ -1568,12 +1585,9 @@
   }
 
   // Check that none of the instructions in the bundle are already in the tree.
-  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
-    auto *I = dyn_cast<Instruction>(VL[i]);
-    if (!I)
-      continue;
-    if (getTreeEntry(I)) {
-      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+  for (Value *V : VL) {
+    if (!isa<UndefValue>(V) && getTreeEntry(V)) {
+      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                         << ") is already in tree.\n");
       newTreeEntry(VL, false, UserTreeIdx);
       return;
@@ -1583,8 +1597,9 @@
   // If any of the scalars is marked as a value that needs to stay scalar, then
   // we need to gather the scalars.
   // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
-  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
-    if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) {
+  for (Value *V : VL) {
+    if (!isa<UndefValue>(V) &&
+        (MustGather.count(V) || is_contained(UserIgnoreList, V))) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
       newTreeEntry(VL, false, UserTreeIdx);
       return;
@@ -1609,6 +1624,11 @@
   SmallVector<Value *, 4> UniqueValues;
   DenseMap<Value *, unsigned> UniquePositions;
   for (Value *V : VL) {
+    if (isa<UndefValue>(V)) {
+      ReuseShuffleIndicies.emplace_back(UniqueValues.size());
+      UniqueValues.emplace_back(V);
+      continue;
+    }
     auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
     ReuseShuffleIndicies.emplace_back(Res.first->second);
     if (Res.second)
@@ -1618,13 +1638,20 @@
     ReuseShuffleIndicies.clear();
   } else {
     LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
-    if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
-      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-      newTreeEntry(VL, false, UserTreeIdx);
-      return;
-    }
+    UniqueValues.append(VL.size() - UniqueValues.size(),
+                        UndefValue::get(VL0->getType()));
     VL = UniqueValues;
   }
+  const unsigned NumberOfInstructions =
+      llvm::count_if(VL, Instruction::classof);
+  if (NumberOfInstructions <= 1) {
+    LLVM_DEBUG(
+        dbgs()
+        << "SLP: Gathering due to vectorization of single instruction.\n");
+    newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+    return;
+  }
+  auto InstructionsOnly = make_filter_range(VL, Instruction::classof);
 
   auto &BSRef = BlocksSchedules[BB];
   if (!BSRef)
@@ -1649,11 +1676,11 @@
       PHINode *PH = dyn_cast<PHINode>(VL0);
 
       // Check for terminator values (e.g. invoke).
-      for (unsigned j = 0; j < VL.size(); ++j)
-        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
-          Instruction *Term = dyn_cast<Instruction>(
-              cast<PHINode>(VL[j])->getIncomingValueForBlock(
-                  PH->getIncomingBlock(i)));
+      for (Value *V : InstructionsOnly)
+        for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
+          auto *Term =
+              dyn_cast<Instruction>(cast<PHINode>(V)->getIncomingValueForBlock(
+                  PH->getIncomingBlock(I)));
           if (Term && Term->isTerminator()) {
             LLVM_DEBUG(dbgs()
                        << "SLP: Need to swizzle PHINodes (terminator use).\n");
@@ -1666,14 +1693,17 @@
       newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
 
-      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+      for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *j : VL)
-          Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
-              PH->getIncomingBlock(i)));
+        for (Value *V : VL) {
+          Operands.emplace_back(
+              isa<UndefValue>(V)
+                  ? UndefValue::get(V->getType())
+                  : cast<PHINode>(V)->getIncomingValueForBlock(
+                        PH->getIncomingBlock(I)));
+        }
 
-        UserTreeIdx.EdgeIdx = i;
+        UserTreeIdx.EdgeIdx = I;
         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
@@ -1740,9 +1770,21 @@
       // Make sure all loads in the bundle are simple - we can't vectorize
      // atomic or volatile loads.
-      SmallVector<Value *, 4> PointerOps(VL.size());
+      SmallVector<Value *, 4> PointerOps(NumberOfInstructions);
       auto POIter = PointerOps.begin();
+      bool TypeIsSimple = VL0->getType()->isIntegerTy() ||
+                          VL0->getType()->isFloatingPointTy() ||
+                          VL0->getType()->isPointerTy();
       for (Value *V : VL) {
+        if (isa<UndefValue>(V)) {
+          if (!TypeIsSimple) {
+            BS.cancelScheduling(VL, VL0);
+            newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+            LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+            return;
+          }
+          continue;
+        }
         auto *L = cast<LoadInst>(V);
         if (!L->isSimple()) {
           BS.cancelScheduling(VL, VL0);
@@ -1772,7 +1814,11 @@
             dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
         uint64_t Size = DL->getTypeAllocSize(ScalarTy);
         // Check that the sorted loads are consecutive.
-        if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
+        if (Diff &&
+            ((NumberOfInstructions < VL.size() &&
+              Diff->getAPInt().getZExtValue() <= (VL.size() - 1) * Size) ||
+             (NumberOfInstructions == VL.size() &&
+              Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size))) {
           if (CurrentOrder.empty()) {
             // Original loads are consecutive and does not require reordering.
             ++NumOpsWantToKeepOriginalOrder;
@@ -1780,6 +1826,8 @@
                          ReuseShuffleIndicies);
             LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
           } else {
+            CurrentOrder.append(VL.size() - NumberOfInstructions,
+                                VL.size() + 1);
             // Need to reorder.
             auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
             ++I->getSecond();
@@ -1809,8 +1857,10 @@
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
       Type *SrcTy = VL0->getOperand(0)->getType();
-      for (unsigned i = 0; i < VL.size(); ++i) {
-        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
+      for (Value *V : VL) {
+        if (isa<UndefValue>(V))
+          continue;
+        Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
         if (Ty != SrcTy || !isValidElementType(Ty)) {
           BS.cancelScheduling(VL, VL0);
           newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
@@ -1825,8 +1875,11 @@
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *j : VL)
-          Operands.push_back(cast<Instruction>(j)->getOperand(i));
+        for (Value *V : VL) {
+          Operands.push_back(isa<UndefValue>(V)
+                                 ? UndefValue::get(SrcTy)
+                                 : cast<Instruction>(V)->getOperand(i));
+        }
 
         UserTreeIdx.EdgeIdx = i;
         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
@@ -1838,8 +1891,10 @@
       // Check that all of the compares have the same predicate.
       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
       Type *ComparedTy = VL0->getOperand(0)->getType();
-      for (unsigned i = 1, e = VL.size(); i < e; ++i) {
-        CmpInst *Cmp = cast<CmpInst>(VL[i]);
+      for (Value *V : VL) {
+        if (isa<UndefValue>(V))
+          continue;
+        auto *Cmp = cast<CmpInst>(V);
         if (Cmp->getPredicate() != P0 ||
             Cmp->getOperand(0)->getType() != ComparedTy) {
           BS.cancelScheduling(VL, VL0);
@@ -1856,8 +1911,12 @@
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *j : VL)
-          Operands.push_back(cast<Instruction>(j)->getOperand(i));
+        for (Value *V : VL) {
+          Operands.push_back(
+              isa<UndefValue>(V)
+                  ? UndefValue::get(VL0->getOperand(i)->getType())
+                  : cast<Instruction>(V)->getOperand(i));
+        }
 
         UserTreeIdx.EdgeIdx = i;
         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
@@ -1901,8 +1960,12 @@
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *j : VL)
-          Operands.push_back(cast<Instruction>(j)->getOperand(i));
+        for (Value *V : VL) {
+          Operands.push_back(
+              isa<UndefValue>(V)
+                  ? UndefValue::get(VL0->getOperand(i)->getType())
+                  : cast<Instruction>(V)->getOperand(i));
+        }
 
         UserTreeIdx.EdgeIdx = i;
         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
@@ -1911,8 +1974,10 @@
     }
     case Instruction::GetElementPtr: {
       // We don't combine GEPs with complicated (nested) indexing.
-      for (unsigned j = 0; j < VL.size(); ++j) {
-        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
+      for (Value *V : VL) {
+        if (isa<UndefValue>(V))
+          continue;
+        if (cast<Instruction>(V)->getNumOperands() != 2) {
           LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
           BS.cancelScheduling(VL, VL0);
           newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
@@ -1923,8 +1988,10 @@
       // We can't combine several GEPs into one vector if they operate on
       // different types.
       Type *Ty0 = VL0->getOperand(0)->getType();
-      for (unsigned j = 0; j < VL.size(); ++j) {
-        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
+      for (Value *V : VL) {
+        if (isa<UndefValue>(V))
+          continue;
+        Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
         if (Ty0 != CurTy) {
           LLVM_DEBUG(dbgs()
                      << "SLP: not-vectorizable GEP (different types).\n");
@@ -1935,8 +2002,10 @@
       }
 
       // We don't combine GEPs with non-constant indexes.
-      for (unsigned j = 0; j < VL.size(); ++j) {
-        auto Op = cast<Instruction>(VL[j])->getOperand(1);
+      for (Value *V : VL) {
+        if (isa<UndefValue>(V))
+          continue;
+        auto *Op = cast<Instruction>(V)->getOperand(1);
         if (!isa<ConstantInt>(Op)) {
           LLVM_DEBUG(dbgs()
                      << "SLP: not-vectorizable GEP (non-constant indexes).\n");
@@ -1951,8 +2020,12 @@
       for (unsigned i = 0, e = 2; i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *j : VL)
-          Operands.push_back(cast<Instruction>(j)->getOperand(i));
+        for (Value *V : VL) {
+          Operands.push_back(
+              isa<UndefValue>(V)
+                  ? UndefValue::get(VL0->getOperand(i)->getType())
+                  : cast<Instruction>(V)->getOperand(i));
+        }
 
         UserTreeIdx.EdgeIdx = i;
         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
@@ -1961,20 +2034,34 @@
     }
     case Instruction::Store: {
       // Check if the stores are consecutive or of we need to swizzle them.
-      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
-        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
+      for (auto I = InstructionsOnly.begin(), E = InstructionsOnly.end();
+           std::next(I) != E; ++I) {
+        if (!isConsecutiveAccess(*I, *std::next(I), *DL, *SE)) {
           BS.cancelScheduling(VL, VL0);
           newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
           LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
           return;
         }
+      }
+      if (NumberOfInstructions != VL.size() &&
+          !(VL0->getOperand(0)->getType()->isIntegerTy() ||
+            VL0->getOperand(0)->getType()->isFloatingPointTy() ||
+            VL0->getOperand(0)->getType()->isPointerTy())) {
+        BS.cancelScheduling(VL, VL0);
+        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+        LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+        return;
+      }
 
       newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
 
       ValueList Operands;
-      for (Value *j : VL)
-        Operands.push_back(cast<Instruction>(j)->getOperand(0));
+      for (Value *V : VL) {
+        Operands.push_back(isa<UndefValue>(V)
+                               ? UndefValue::get(VL0->getOperand(0)->getType())
+                               : cast<Instruction>(V)->getOperand(0));
+      }
 
       UserTreeIdx.EdgeIdx = 0;
       buildTree_rec(Operands, Depth + 1, UserTreeIdx);
@@ -1998,15 +2085,21 @@
       for (unsigned j = 0; j != NumArgs; ++j)
         if (hasVectorInstrinsicScalarOpd(ID, j))
           ScalarArgs[j] = CI->getArgOperand(j);
-      for (unsigned i = 1, e = VL.size(); i != e; ++i) {
-        CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
+      for (Value *V : VL) {
+        if (isa<UndefValue>(V)) {
+          BS.cancelScheduling(VL, VL0);
+          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+          LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+          return;
+        }
+        CallInst *CI2 = dyn_cast<CallInst>(V);
         if (!CI2 || CI2->getCalledFunction() != Int ||
             getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
             !CI->hasIdenticalOperandBundleSchema(*CI2)) {
           BS.cancelScheduling(VL, VL0);
           newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
-          LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
-                            << "\n");
+          LLVM_DEBUG(dbgs()
+                     << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n");
           return;
         }
         // Some intrinsics have scalar arguments and should be same in order for
@@ -2032,7 +2125,7 @@
             BS.cancelScheduling(VL, VL0);
             newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
             LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
-                              << *CI << "!=" << *VL[i] << '\n');
+                              << *CI << "!=" << *V << '\n');
             return;
           }
       }
@@ -2041,8 +2134,8 @@
       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *j : VL) {
-          CallInst *CI2 = dyn_cast<CallInst>(j);
+        for (Value *V : VL) {
+          auto *CI2 = dyn_cast<CallInst>(V);
           Operands.push_back(CI2->getArgOperand(i));
         }
         UserTreeIdx.EdgeIdx = i;
@@ -2076,8 +2169,11 @@
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *j : VL)
-          Operands.push_back(cast<Instruction>(j)->getOperand(i));
+        for (Value *V : VL)
+          Operands.push_back(
+              isa<UndefValue>(V)
+                  ? UndefValue::get(VL0->getOperand(i)->getType())
+                  : cast<Instruction>(V)->getOperand(i));
 
         UserTreeIdx.EdgeIdx = i;
         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
@@ -2144,9 +2240,6 @@
     NElts = Vec->getType()->getVectorNumElements();
   }
 
-  if (NElts != VL.size())
-    return false;
-
   // Check that all of the indices extract from the correct offset.
   bool ShouldKeepOrder = true;
   unsigned E = VL.size();
@@ -2156,8 +2249,10 @@
   // consecutive access in the extract instructions, by checking that no
   // element of CurrentOrder still has value E + 1.
   CurrentOrder.assign(E, E + 1);
-  unsigned I = 0;
-  for (; I < E; ++I) {
+  unsigned I = 0, End = std::min(NElts, E);
+  for (; I < End; ++I) {
+    if (isa<UndefValue>(VL[I]))
+      continue;
    auto *Inst = cast<Instruction>(VL[I]);
    if (Inst->getOperand(0) != Vec)
      break;
@@ -2176,10 +2271,12 @@
      CurrentOrder[I] = I;
    }
  }
-  if (I < E) {
+  if (I < End) {
    CurrentOrder.clear();
    return false;
  }
+  if (NElts > E)
+    return false;
 
  return ShouldKeepOrder;
}
@@ -2194,18 +2291,29 @@
 
 int BoUpSLP::getEntryCost(TreeEntry *E) {
   ArrayRef<Value *> VL = E->Scalars;
+  auto InstructionsOnly = make_filter_range(VL, Instruction::classof);
+  const unsigned NumOfInstructions =
+      llvm::count_if(InstructionsOnly, [](Value *V) { return true; });
 
+  Value *V0;
   Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-  else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
-    ScalarTy = CI->getOperand(0)->getType();
-  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
-
-  // If we have computed a smaller type for the expression, update VecTy so
-  // that the costs will be accurate.
-  if (MinBWs.count(VL[0]))
-    VecTy = VectorType::get(
-        IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+  VectorType *VecTy;
+  if (!llvm::empty(InstructionsOnly)) {
+    V0 = *InstructionsOnly.begin();
+    if (StoreInst *SI = dyn_cast<StoreInst>(V0))
+      ScalarTy = SI->getValueOperand()->getType();
+    else if (CmpInst *CI = dyn_cast<CmpInst>(V0))
+      ScalarTy = CI->getOperand(0)->getType();
+    VecTy = VectorType::get(ScalarTy, VL.size());
+
+    // If we have computed a smaller type for the expression, update VecTy so
+    // that the costs will be accurate.
+    if (MinBWs.count(V0)) {
+      VecTy = VectorType::get(
+          IntegerType::get(F->getContext(), MinBWs[V0].first), VL.size());
+    }
+  } else {
+    VecTy = VectorType::get(ScalarTy, VL.size());
+  }
 
   unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
@@ -2223,10 +2331,15 @@
   }
   if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement &&
       allSameType(VL) && allSameBlock(VL)) {
-    Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
-    if (ShuffleKind.hasValue()) {
-      int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
-      for (auto *V : VL) {
+    Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
+        NumOfInstructions > 1
+            ? isShuffle(llvm::to_vector<4>(InstructionsOnly))
+            : None;
+    if (NumOfInstructions == 1 || ShuffleKind) {
+      int Cost = NumOfInstructions > 1
+                     ? TTI->getShuffleCost(*ShuffleKind, VecTy)
+                     : 0;
+      for (Value *V : InstructionsOnly) {
         // If all users of instruction are going to be vectorized and this
         // instruction itself is not going to be vectorized, consider this
         // instruction as dead and remove its cost from the final cost of the
@@ -2235,8 +2348,10 @@
             !ScalarToTreeEntry.count(V)) {
           auto *IO = cast<ConstantInt>(
               cast<ExtractElementInst>(V)->getIndexOperand());
-          Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                          IO->getZExtValue());
+          Cost -= TTI->getVectorInstrCost(
+              Instruction::ExtractElement,
+              cast<ExtractElementInst>(V)->getVectorOperandType(),
+              IO->getZExtValue());
         }
       }
       return ReuseShuffleCost + Cost;
@@ -2254,16 +2369,19 @@
       return 0;
 
     case Instruction::ExtractValue:
-    case Instruction::ExtractElement:
+    case Instruction::ExtractElement: {
      if (NeedToShuffleReuses) {
        unsigned Idx = 0;
        for (unsigned I : E->ReuseShuffleIndices) {
+          if (isa<UndefValue>(VL[I]))
+            continue;
          if (ShuffleOrOp == Instruction::ExtractElement) {
            auto *IO = cast<ConstantInt>(
                cast<ExtractElementInst>(VL[I])->getIndexOperand());
            Idx = IO->getZExtValue();
            ReuseShuffleCost -= TTI->getVectorInstrCost(
-                Instruction::ExtractElement, VecTy, Idx);
+                Instruction::ExtractElement,
+                cast<ExtractElementInst>(VL[I])->getVectorOperandType(), Idx);
          } else {
            ReuseShuffleCost -= TTI->getVectorInstrCost(
                Instruction::ExtractElement, VecTy, Idx);
@@ -2272,53 +2390,77 @@
        }
        Idx = ReuseShuffleNumbers;
        for (Value *V : VL) {
+          if (isa<UndefValue>(V))
+            continue;
          if (ShuffleOrOp == Instruction::ExtractElement) {
            auto *IO = cast<ConstantInt>(
                cast<ExtractElementInst>(V)->getIndexOperand());
            Idx = IO->getZExtValue();
+            ReuseShuffleCost += TTI->getVectorInstrCost(
+                Instruction::ExtractElement,
+                cast<ExtractElementInst>(V)->getVectorOperandType(), Idx);
          } else {
            --Idx;
+            ReuseShuffleCost += TTI->getVectorInstrCost(
+                Instruction::ExtractElement, VecTy, Idx);
          }
-          ReuseShuffleCost +=
-              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
        }
      }
-      if (!E->NeedToGather) {
-        int DeadCost = ReuseShuffleCost;
-        if (!E->ReorderIndices.empty()) {
-          // TODO: Merge this shuffle with the ReuseShuffleCost.
-          DeadCost += TTI->getShuffleCost(
-              TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
-        }
-        for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-          Instruction *E = cast<Instruction>(VL[i]);
-          // If all users are going to be vectorized, instruction can be
-          // considered as dead.
-          // The same, if have only one user, it will be vectorized for sure.
-          if (areAllUsersVectorized(E)) {
-            // Take credit for instruction that will become dead.
-            if (E->hasOneUse()) {
-              Instruction *Ext = E->user_back();
-              if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
-                  all_of(Ext->users(),
-                         [](User *U) { return isa<GetElementPtrInst>(U); })) {
-                // Use getExtractWithExtendCost() to calculate the cost of
-                // extractelement/ext pair.
-                DeadCost -= TTI->getExtractWithExtendCost(
-                    Ext->getOpcode(), Ext->getType(), VecTy, i);
-                // Add back the cost of s|zext which is subtracted separately.
-                DeadCost += TTI->getCastInstrCost(
-                    Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
-                continue;
-              }
+#ifndef NDEBUG
+      OrdersType CurrentOrder;
+      bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+      assert(((Reuse && E->ReorderIndices.empty()) ||
+              (!Reuse && CurrentOrder.size() == E->ReorderIndices.size() &&
+               std::equal(CurrentOrder.begin(), CurrentOrder.end(),
+                          E->ReorderIndices.begin()))) &&
+             "The sequence of extract elements must be reused or shuffled "
+             "with the same mask.");
+#endif
+      int DeadCost = ReuseShuffleCost;
+      if (!E->ReorderIndices.empty()) {
+        // TODO: Merge this shuffle with the ReuseShuffleCost.
+        DeadCost += TTI->getShuffleCost(
+            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+      }
+      for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+        if (isa<UndefValue>(VL[I]))
+          continue;
+        auto *EI = cast<Instruction>(VL[I]);
+        // If all users are going to be vectorized, instruction can be
+        // considered as dead.
+        // The same, if have only one user, it will be vectorized for sure.
+        if (areAllUsersVectorized(EI)) {
+          // Take credit for instruction that will become dead.
+          if (EI->hasOneUse()) {
+            Instruction *Ext = EI->user_back();
+            if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+                all_of(Ext->users(),
+                       [](User *U) { return isa<GetElementPtrInst>(U); })) {
+              // Use getExtractWithExtendCost() to calculate the cost of
+              // extractelement/ext pair.
+              DeadCost -= TTI->getExtractWithExtendCost(
+                  Ext->getOpcode(), Ext->getType(), VecTy, I);
+              // Add back the cost of s|zext which is subtracted separately.
+              DeadCost += TTI->getCastInstrCost(
+                  Ext->getOpcode(), Ext->getType(), EI->getType(), Ext);
+              continue;
            }
+          }
+          if (ShuffleOrOp == Instruction::ExtractElement) {
+            auto *IO = cast<ConstantInt>(
+                cast<ExtractElementInst>(EI)->getIndexOperand());
+            unsigned Idx = IO->getZExtValue();
+            DeadCost -= TTI->getVectorInstrCost(
+                Instruction::ExtractElement,
+                cast<ExtractElementInst>(EI)->getVectorOperandType(), Idx);
+          } else {
            DeadCost -=
-                TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
+                TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
          }
        }
-        return DeadCost;
      }
-      return ReuseShuffleCost + getGatherCost(VL);
+      return DeadCost;
+    }
 
    case Instruction::ZExt:
    case Instruction::SExt:
@@ -2336,11 +2478,12 @@
      int ScalarEltCost =
          TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
      }
 
      // Calculate the cost of this instruction.
-      int ScalarCost = VL.size() * ScalarEltCost;
+      int ScalarCost = NumOfInstructions * ScalarEltCost;
 
      VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
      int VecCost = 0;
@@ -2358,10 +2501,11 @@
      int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy,
                                                  Builder.getInt1Ty(), VL0);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
      }
      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
-      int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+      int ScalarCost = NumOfInstructions * ScalarEltCost;
      int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0);
      return ReuseShuffleCost + VecCost - ScalarCost;
    }
@@ -2399,23 +2543,26 @@
      // If instead not all operands are constants, then set the operand kind
      // to OK_AnyValue. If all operands are constants but not the same,
      // then set the operand kind to OK_NonUniformConstantValue.
-      ConstantInt *CInt0 = nullptr;
+      Constant *C0 = nullptr;
      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+        if (isa<UndefValue>(VL[i]))
+          continue;
        const Instruction *I = cast<Instruction>(VL[i]);
-        ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1));
-        if (!CInt) {
+        Constant *CInt = dyn_cast<ConstantInt>(I->getOperand(1));
+        Constant *UV = dyn_cast<UndefValue>(I->getOperand(1));
+        if (!CInt && !UV) {
          Op2VK = TargetTransformInfo::OK_AnyValue;
          Op2VP = TargetTransformInfo::OP_None;
          break;
        }
        if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
-            !CInt->getValue().isPowerOf2())
+            (UV || !cast<ConstantInt>(CInt)->getValue().isPowerOf2()))
          Op2VP = TargetTransformInfo::OP_None;
        if (i == 0) {
-          CInt0 = CInt;
+          C0 = CInt ? CInt : UV;
          continue;
        }
-        if (CInt0 != CInt)
+        if (C0 != (CInt ? CInt : UV))
          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
      }
 
@@ -2423,9 +2570,10 @@
      int ScalarEltCost = TTI->getArithmeticInstrCost(
          S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
      }
-      int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+      int ScalarCost = NumOfInstructions * ScalarEltCost;
      int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK,
                                                Op2VK, Op1VP, Op2VP, Operands);
      return ReuseShuffleCost + VecCost - ScalarCost;
@@ -2439,9 +2587,10 @@
      int ScalarEltCost =
          TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
      }
-      int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+      int ScalarCost = NumOfInstructions * ScalarEltCost;
      int VecCost =
          TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
      return ReuseShuffleCost + VecCost - ScalarCost;
@@ -2452,12 +2601,33 @@
      int ScalarEltCost =
          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
      }
+      int ScalarLdCost = NumOfInstructions * ScalarEltCost;
+      int VecLdCost;
+      auto FirstInstr = llvm::find_if(VL, Instruction::classof);
+      auto LastInstr = llvm::find_if(llvm::reverse(VL), Instruction::classof);
+      unsigned InstrDist = std::distance(FirstInstr, LastInstr.base());
+      bool IsPowOf2NumOfInstructions =
+          llvm::isPowerOf2_32(InstrDist);
+      if (!IsPowOf2NumOfInstructions || InstrDist == 1) {
+        VecLdCost = TTI->getIntrinsicInstrCost(
+            Intrinsic::masked_load, VecTy,
+            {VecTy->getPointerTo(), Builder.getInt32Ty(),
+             VectorType::get(Builder.getInt1Ty(),
+                             VecTy->getVectorNumElements()),
+             VecTy},
+            FastMathFlags());
+      } else if (InstrDist == VL.size()) {
+        VecLdCost =
+            TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
+      } else {
+        VectorType *LoadVecTy = VectorType::get(ScalarTy, InstrDist);
+        VecLdCost = TTI->getMemoryOpCost(Instruction::Load, LoadVecTy,
+                                         alignment, 0, VL0);
+      }
-      int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
-      int VecLdCost =
-          TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
-      if (!E->ReorderIndices.empty()) {
+      if (!NeedToShuffleReuses &&
+          (!E->ReorderIndices.empty() ||
+           (IsPowOf2NumOfInstructions && InstrDist != VL.size()))) {
        // TODO: Merge this shuffle with the ReuseShuffleCost.
        VecLdCost += TTI->getShuffleCost(
            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
@@ -2470,11 +2640,22 @@
      int ScalarEltCost =
          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
+      }
+      int ScalarStCost = NumOfInstructions * ScalarEltCost;
+      int VecStCost;
+      if (NumOfInstructions != VL.size()) {
+        VecStCost = TTI->getIntrinsicInstrCost(
+            Intrinsic::masked_store, Builder.getVoidTy(),
+            {VecTy, VecTy->getPointerTo(), Builder.getInt32Ty(),
+             VectorType::get(Builder.getInt1Ty(),
+                             VecTy->getVectorNumElements())},
+            FastMathFlags());
+      } else {
+        VecStCost =
+            TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
      }
-      int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
-      int VecStCost =
-          TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
      return ReuseShuffleCost + VecStCost - ScalarStCost;
    }
    case Instruction::Call: {
@@ -2493,9 +2674,10 @@
      int ScalarEltCost =
          TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
      }
-      int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
+      int ScalarCallCost = NumOfInstructions * ScalarEltCost;
 
      SmallVector<Value *, 4> Args(CI->arg_operands());
      int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
@@ -2517,18 +2699,22 @@
      int ScalarCost = 0;
      if (NeedToShuffleReuses) {
        for (unsigned Idx : E->ReuseShuffleIndices) {
-          Instruction *I = cast<Instruction>(VL[Idx]);
+          if (isa<UndefValue>(VL[Idx]))
+            continue;
+          auto *I = cast<Instruction>(VL[Idx]);
          ReuseShuffleCost -= TTI->getInstructionCost(
              I, TargetTransformInfo::TCK_RecipThroughput);
        }
-        for (Value *V : VL) {
+        for (Value *V : InstructionsOnly) {
          Instruction *I = cast<Instruction>(V);
          ReuseShuffleCost += TTI->getInstructionCost(
              I, TargetTransformInfo::TCK_RecipThroughput);
        }
      }
-      for (Value *i : VL) {
-        Instruction *I = cast<Instruction>(i);
+      for (Value *V : VL) {
+        if (isa<UndefValue>(V))
+          continue;
+        auto *I = cast<Instruction>(V);
        assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
        ScalarCost += TTI->getInstructionCost(
            I, TargetTransformInfo::TCK_RecipThroughput);
@@ -2749,12 +2935,15 @@
  return Cost;
}
 
-int BoUpSLP::getGatherCost(Type *Ty,
-                           const DenseSet<unsigned> &ShuffledIndices) {
+int BoUpSLP::getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices,
+                           const DenseSet<unsigned> &IgnoredIndices) {
  int Cost = 0;
-  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
-    if (!ShuffledIndices.count(i))
-      Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+  for (unsigned I = 0, E = cast<VectorType>(Ty)->getNumElements(); I < E; ++I) {
+    if (IgnoredIndices.count(I))
+      continue;
+    if (!ShuffledIndices.count(I))
+      Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, I);
+  }
  if (!ShuffledIndices.empty())
    Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
  return Cost;
@@ -2770,14 +2959,19 @@
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  DenseSet<unsigned> ShuffledElements;
+  DenseSet<unsigned> IgnoredElements;
  DenseSet<Value *> UniqueElements;
  // Iterate in reverse order to consider insert elements with the high cost.
  for (unsigned I = VL.size(); I > 0; --I) {
    unsigned Idx = I - 1;
+    if (isa<UndefValue>(VL[Idx])) {
+      IgnoredElements.insert(Idx);
+      continue;
+    }
    if (!UniqueElements.insert(VL[Idx]).second)
      ShuffledElements.insert(Idx);
  }
-  return getGatherCost(VecTy, ShuffledElements);
+  return getGatherCost(VecTy, ShuffledElements, IgnoredElements);
}
 
// Return true if the i'th left and right operands can be commuted.
@@ -2785,46 +2979,47 @@
// The vectorizer is trying to either have all elements one side being
// instruction with the same opcode to enable further vectorization, or having
// a splat to lower the vectorizing cost.
-static bool shouldReorderOperands(int i, ArrayRef<Value *> Left,
+static bool shouldReorderOperands(unsigned PrevInstCnt, Instruction &I,
+                                  ArrayRef<Value *> Left,
                                  ArrayRef<Value *> Right,
                                  bool AllSameOpcodeLeft,
                                  bool AllSameOpcodeRight, bool SplatLeft,
                                  bool SplatRight) {
-  Value *PrevLeft = Left[i - 1];
-  Value *PrevRight = Right[i - 1];
-  Value *CurrLeft = Left[i];
-  Value *CurrRight = Right[i];
-
+  Value *VLeft = I.getOperand(0);
+  Value *VRight = I.getOperand(1);
  // If we have "SplatRight", try to see if commuting is needed to preserve it.
  if (SplatRight) {
-    if (CurrRight == PrevRight)
+    if (VRight == Right[PrevInstCnt])
      // Preserve SplatRight
      return false;
-    if (CurrLeft == PrevRight) {
+    if (VLeft == Right[PrevInstCnt]) {
      // Commuting would preserve SplatRight, but we don't want to break
      // SplatLeft either, i.e. preserve the original order if possible.
      // (FIXME: why do we care?)
-      if (SplatLeft && CurrLeft == PrevLeft)
+      if (SplatLeft && VLeft == Left[PrevInstCnt])
        return false;
      return true;
    }
  }
  // Symmetrically handle Right side.
  if (SplatLeft) {
-    if (CurrLeft == PrevLeft)
+    if (VLeft == Left[PrevInstCnt])
      // Preserve SplatLeft
      return false;
-    if (CurrRight == PrevLeft)
+    if (VRight == Left[PrevInstCnt])
      return true;
  }
 
-  Instruction *ILeft = dyn_cast<Instruction>(CurrLeft);
-  Instruction *IRight = dyn_cast<Instruction>(CurrRight);
+  Instruction *ILeft = dyn_cast<Instruction>(VLeft);
+  Instruction *IRight = dyn_cast<Instruction>(VRight);
 
  // If we have "AllSameOpcodeRight", try to see if the left operands preserves
  // it and not the right, in this case we want to commute.
  if (AllSameOpcodeRight) {
-    unsigned RightPrevOpcode = cast<Instruction>(PrevRight)->getOpcode();
+    auto PrevRIt = llvm::find_if(Right, Instruction::classof);
+    if (PrevRIt == Right.end())
+      return false;
+    unsigned RightPrevOpcode = cast<Instruction>(*PrevRIt)->getOpcode();
    if (IRight && RightPrevOpcode == IRight->getOpcode())
      // Do not commute, a match on the right preserves AllSameOpcodeRight
      return false;
@@ -2833,15 +3028,19 @@
      // not also a match on the existing operands on the Left to preserve
      // AllSameOpcodeLeft, i.e. preserve the original order if possible.
      // (FIXME: why do we care?)
-      if (AllSameOpcodeLeft && ILeft &&
-          cast<Instruction>(PrevLeft)->getOpcode() == ILeft->getOpcode())
+      auto PrevLIt = llvm::find_if(Left, Instruction::classof);
+      if (AllSameOpcodeLeft && ILeft && PrevLIt != Left.end() &&
+          cast<Instruction>(*PrevLIt)->getOpcode() == ILeft->getOpcode())
        return false;
      return true;
    }
  }
  // Symmetrically handle Left side.
  if (AllSameOpcodeLeft) {
-    unsigned LeftPrevOpcode = cast<Instruction>(PrevLeft)->getOpcode();
+    auto PrevLIt = llvm::find_if(Left, Instruction::classof);
+    if (PrevLIt == Left.end())
+      return false;
+    unsigned LeftPrevOpcode = cast<Instruction>(*PrevLIt)->getOpcode();
    if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
      return false;
    if (IRight && LeftPrevOpcode == IRight->getOpcode())
@@ -2859,6 +3058,11 @@
 
  // Push left and right operands of binary operation into Left and Right
  for (Value *V : VL) {
+    if (isa<UndefValue>(V)) {
+      Left.push_back(V);
+      Right.push_back(V);
+      continue;
+    }
    auto *I = cast<Instruction>(V);
    assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector");
    Left.push_back(I->getOperand(0));
@@ -2866,30 +3070,54 @@
  }
 
  // Keep track if we have instructions with all the same opcode on one side.
-  bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
-  bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
+  bool AllSameOpcodeLeft = true;
+  bool AllSameOpcodeRight = true;
  // Keep track if we have one side with all the same value (broadcast).
  bool SplatLeft = true;
  bool SplatRight = true;
+  Instruction *LeftInst = nullptr;
+  Instruction *RightInst = nullptr;
+  unsigned PrevInstCnt = 0;
+  bool PrevSet = false;
 
  for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+    if (isa<UndefValue>(VL[i]))
+      continue;
+    if (!PrevSet) {
+      PrevSet = true;
+      PrevInstCnt = i;
+      continue;
+    }
    Instruction *I = cast<Instruction>(VL[i]);
    // Commute to favor either a splat or maximizing having the same opcodes on
    // one side.
    if (I->isCommutative() &&
-        shouldReorderOperands(i, Left, Right, AllSameOpcodeLeft,
+        shouldReorderOperands(PrevInstCnt, *I, Left, Right, AllSameOpcodeLeft,
                              AllSameOpcodeRight, SplatLeft, SplatRight))
      std::swap(Left[i], Right[i]);
 
    // Update Splat* and AllSameOpcode* after the insertion.
-    SplatRight = SplatRight && (Right[i - 1] == Right[i]);
-    SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
-    AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
-                        (cast<Instruction>(Left[i - 1])->getOpcode() ==
-                         cast<Instruction>(Left[i])->getOpcode());
-    AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
-                         (cast<Instruction>(Right[i - 1])->getOpcode() ==
-                          cast<Instruction>(Right[i])->getOpcode());
+    SplatRight = SplatRight && (Right[PrevInstCnt] == Right[i]);
+    SplatLeft = SplatLeft && (Left[PrevInstCnt] == Left[i]);
+    AllSameOpcodeLeft =
+        AllSameOpcodeLeft &&
+        (isa<UndefValue>(Left[i]) ||
+         (isa<Instruction>(Left[i]) &&
+          (!LeftInst || (LeftInst->getOpcode() ==
+                         cast<Instruction>(Left[i])->getOpcode()))));
+    AllSameOpcodeRight =
+        AllSameOpcodeRight &&
+        (isa<UndefValue>(Right[i]) ||
+         (isa<Instruction>(Right[i]) &&
+          (!RightInst || (RightInst->getOpcode() ==
+                          cast<Instruction>(Right[i])->getOpcode()))));
+    if (!LeftInst)
+      if (auto *IV = dyn_cast<Instruction>(Left[i]))
+        LeftInst = IV;
+    if (!RightInst)
+      if (auto *IV = dyn_cast<Instruction>(Right[i]))
+        RightInst = IV;
+    PrevInstCnt = i;
  }
 
  // If one operand end up being broadcast, return this operand order.
@@ -2950,11 +3178,14 @@
 
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
                                        const InstructionsState &S) {
+  auto InstructionsOnly = make_filter_range(VL, Instruction::classof);
+  if (llvm::empty(InstructionsOnly))
+    return;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block.
  auto *Front = cast<Instruction>(S.OpValue);
  auto *BB = Front->getParent();
-  assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
+  assert(llvm::all_of(InstructionsOnly, [=](Value *V) -> bool {
    auto *I = cast<Instruction>(V);
    return !S.isOpcodeOrAlt(I) || I->getParent() == BB;
  }));
@@ -2967,8 +3198,8 @@
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB)) {
-    auto *Bundle =
-        BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back()));
+    auto *Bundle = BlocksSchedules[BB]->getScheduleData(
+        isOneOf(S, *llvm::reverse(InstructionsOnly).begin()));
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        if (Bundle->OpValue == Bundle->Inst)
@@ -2994,7 +3225,8 @@
  // we both exit early from buildTree_rec and that the bundle be out-of-order
  // (causing us to iterate all the way to the end of the block).
  if (!LastInst) {
-    SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
+    SmallPtrSet<Value *, 16> Bundle(InstructionsOnly.begin(),
+                                    InstructionsOnly.end());
    for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
      if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I))
        LastInst = &I;
@@ -3013,6 +3245,8 @@
  Value *Vec = UndefValue::get(Ty);
  // Generate the 'InsertElement' instruction.
  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
+    if (isa<UndefValue>(VL[i]))
+      continue;
    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
    if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
      GatherSeq.insert(Insrt);
@@ -3046,27 +3280,27 @@
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
  InstructionsState S = getSameOpcode(VL);
  if (S.getOpcode()) {
-    if (TreeEntry *E = getTreeEntry(S.OpValue)) {
-      if (E->isSame(VL)) {
-        Value *V = vectorizeTree(E);
-        if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
-          // We need to get the vectorized value but without shuffle.
-          if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
-            V = SV->getOperand(0);
-          } else {
-            // Reshuffle to get only unique values.
-            SmallVector<unsigned, 4> UniqueIdxs;
-            SmallSet<unsigned, 4> UsedIdxs;
-            for (unsigned Idx : E->ReuseShuffleIndices)
-              if (UsedIdxs.insert(Idx).second)
-                UniqueIdxs.emplace_back(Idx);
-            V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
-                                            UniqueIdxs);
-          }
-        }
-        return V;
+    // Check that every instruction appears once in this bundle.
+    SmallVector<Value *, 4> UniqueValues;
+    DenseMap<Value *, unsigned> UniquePositions;
+    UniqueValues.reserve(VL.size());
+    for (Value *V : VL) {
+      if (isa<UndefValue>(V)) {
+        UniqueValues.emplace_back(V);
+        continue;
      }
+      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+      if (Res.second)
+        UniqueValues.emplace_back(V);
+    }
+    if (UniqueValues.size() != VL.size()) {
+      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+      UniqueValues.append(VL.size() - UniqueValues.size(),
+                          UndefValue::get(VL[0]->getType()));
    }
+    if (TreeEntry *E = getTreeEntry(S.OpValue))
+      if (E->isSame(UniqueValues))
+        return vectorizeTree(E);
  }
 
  Type *ScalarTy = S.OpValue->getType();
@@ -3110,9 +3344,10 @@
                               SmallVectorImpl<unsigned> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
-  Mask.resize(E);
+  Mask.resize(E, E - 1);
  for (unsigned I = 0; I < E; ++I)
-    Mask[Indices[I]] = I;
+    if (Indices[I] < E)
+      Mask[Indices[I]] = I;
}
 
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
@@ -3129,6 +3364,8 @@
  if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+  if (isa<UndefValue>(VL0))
+    return UndefValue::get(VecTy);
 
  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
 
@@ -3146,9 +3383,9 @@
    E->VectorizedValue = V;
    return V;
  }
-
-  unsigned ShuffleOrOp = S.isAltShuffle() ?
-          (unsigned) Instruction::ShuffleVector : S.getOpcode();
+  auto InstructionsOnly = make_filter_range(E->Scalars, Instruction::classof);
+  unsigned ShuffleOrOp =
+      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
    case Instruction::PHI: {
      PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -3187,72 +3424,44 @@
    }
 
    case Instruction::ExtractElement: {
-      if (!E->NeedToGather) {
-        Value *V = E->getSingleOperand(0);
-        if (!E->ReorderIndices.empty()) {
-          OrdersType Mask;
-          inversePermutation(E->ReorderIndices, Mask);
-          Builder.SetInsertPoint(VL0);
-          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
-                                          "reorder_shuffle");
-        }
-        if (NeedToShuffleReuses) {
-          // TODO: Merge this shuffle with the ReorderShuffleMask.
-          if (E->ReorderIndices.empty())
-            Builder.SetInsertPoint(VL0);
-          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
-                                          E->ReuseShuffleIndices, "shuffle");
-        }
-        E->VectorizedValue = V;
-        return V;
+      Value *V = E->getSingleOperand(0);
+      if (!E->ReorderIndices.empty()) {
+        OrdersType Mask;
+        inversePermutation(E->ReorderIndices, Mask);
+        Builder.SetInsertPoint(VL0);
+        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask,
+                                        "reorder_shuffle");
      }
-      setInsertPointAfterBundle(E->Scalars, S);
-      auto *V = Gather(E->Scalars, VecTy);
      if (NeedToShuffleReuses) {
-        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+        // TODO: Merge this shuffle with the ReorderShuffleMask.
+        if (E->ReorderIndices.empty())
+          Builder.SetInsertPoint(VL0);
+        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
                                        E->ReuseShuffleIndices, "shuffle");
-        if (auto *I = dyn_cast<Instruction>(V)) {
-          GatherSeq.insert(I);
-          CSEBlocks.insert(I->getParent());
-        }
      }
      E->VectorizedValue = V;
      return V;
    }
    case Instruction::ExtractValue: {
-      if (!E->NeedToGather) {
-        LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
-        Builder.SetInsertPoint(LI);
-        PointerType *PtrTy =
-            PointerType::get(VecTy, LI->getPointerAddressSpace());
-        Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
-        LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment());
-        Value *NewV = propagateMetadata(V, E->Scalars);
-        if (!E->ReorderIndices.empty()) {
-          OrdersType Mask;
-          inversePermutation(E->ReorderIndices, Mask);
-          NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
-                                             "reorder_shuffle");
-        }
-        if (NeedToShuffleReuses) {
-          // TODO: Merge this shuffle with the ReorderShuffleMask.
-          NewV = Builder.CreateShuffleVector(
-              NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
-        }
-        E->VectorizedValue = NewV;
-        return NewV;
+      auto *LI = cast<LoadInst>(VL0->getOperand(0));
+      Builder.SetInsertPoint(LI);
+      auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
+      Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+      LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
+      Value *NewV = propagateMetadata(V, llvm::to_vector<4>(InstructionsOnly));
+      if (!E->ReorderIndices.empty()) {
+        OrdersType Mask;
+        inversePermutation(E->ReorderIndices, Mask);
+        NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
+                                           "reorder_shuffle");
      }
-      setInsertPointAfterBundle(E->Scalars, S);
-      auto *V = Gather(E->Scalars, VecTy);
      if (NeedToShuffleReuses) {
-        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
-                                        E->ReuseShuffleIndices, "shuffle");
-        if (auto *I = dyn_cast<Instruction>(V)) {
-          GatherSeq.insert(I);
-          CSEBlocks.insert(I->getParent());
-        }
+        // TODO: Merge this shuffle with the ReorderShuffleMask.
+        NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy),
+                                           E->ReuseShuffleIndices, "shuffle");
      }
-      E->VectorizedValue = V;
-      return V;
+      E->VectorizedValue = NewV;
+      return NewV;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
@@ -3366,7 +3575,7 @@
          static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
-        V = propagateMetadata(I, E->Scalars);
+        V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly));
 
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -3391,8 +3600,21 @@
      Type *ScalarLoadTy = LI->getType();
      unsigned AS = LI->getPointerAddressSpace();
 
-      Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
-                                            VecTy->getPointerTo(AS));
+      auto FirstInstr = llvm::find_if(E->Scalars, Instruction::classof);
+      auto LastInstr =
+          llvm::find_if(llvm::reverse(E->Scalars), Instruction::classof).base();
+      unsigned NumOfInstructions = std::distance(FirstInstr, LastInstr);
+      bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(NumOfInstructions);
+      Value *VecPtr;
+      if (!IsPowOf2NumOfInstructions ||
+          NumOfInstructions == E->Scalars.size() || NumOfInstructions == 1) {
+        VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+                                       VecTy->getPointerTo(AS));
+      } else {
+        VecPtr = Builder.CreateBitCast(
+            LI->getPointerOperand(),
+            VectorType::get(ScalarTy, NumOfInstructions)->getPointerTo(AS));
+      }
 
      // The pointer operand uses an in-tree scalar so we add the new BitCast to
      // ExternalUses list to make sure that an extract will be generated in the
@@ -3402,12 +3624,42 @@
        ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
 
      unsigned Alignment = LI->getAlignment();
-      LI = Builder.CreateLoad(VecTy, VecPtr);
      if (!Alignment) {
        Alignment = DL->getABITypeAlignment(ScalarLoadTy);
      }
-      LI->setAlignment(Alignment);
-      Value *V = propagateMetadata(LI, E->Scalars);
+      Instruction *VecLI;
+      if (IsPowOf2NumOfInstructions) {
+        LI = Builder.CreateLoad(VecPtr);
+        LI->setAlignment(Alignment);
+        VecLI = LI;
+      } else {
+        SmallVector<Constant *, 4> Mask;
+        SmallVector<Constant *, 4> Passthrough;
+        Mask.reserve(E->Scalars.size());
+        Passthrough.reserve(E->Scalars.size());
+        for (auto *V : E->Scalars) {
+          Mask.emplace_back(Builder.getInt1(!isa<UndefValue>(V)));
+          Passthrough.emplace_back(isa<UndefValue>(V)
+                                       ? cast<Constant>(V)
+                                       : UndefValue::get(LI->getType()));
+        }
+        VecLI = Builder.CreateMaskedLoad(VecPtr, Alignment,
+                                         ConstantVector::get(Mask),
+                                         ConstantVector::get(Passthrough));
+      }
+      Value *V = propagateMetadata(VecLI, llvm::to_vector<4>(InstructionsOnly));
+      if (PowerOf2Ceil(NumOfInstructions) != E->Scalars.size()) {
+        SmallVector<uint32_t, 4> ExtendedIndices(E->Scalars.size(),
+                                                 NumOfInstructions);
+        const unsigned Dist = std::distance(E->Scalars.begin(), FirstInstr);
+        auto LI = FirstInstr;
+        for (unsigned I = 0; I < NumOfInstructions; ++I, ++LI) {
+          if (!isa<UndefValue>(*LI))
+            ExtendedIndices[I] = Dist + I;
+        }
+        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                        ExtendedIndices, "load.extend");
+      }
      if (IsReorder) {
        OrdersType Mask;
        inversePermutation(E->ReorderIndices, Mask);
@@ -3427,13 +3679,55 @@
      StoreInst *SI = cast<StoreInst>(VL0);
      unsigned Alignment = SI->getAlignment();
      unsigned AS = SI->getPointerAddressSpace();
+      if (!Alignment)
+        Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
 
      setInsertPointAfterBundle(E->Scalars, S);
 
      Value *VecValue = vectorizeTree(E->getOperand(0));
      Value *ScalarPtr = SI->getPointerOperand();
-      Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
-      StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
+
+      auto FirstInstr = llvm::find_if(E->Scalars, Instruction::classof);
+      auto LastInstr =
+          llvm::find_if(llvm::reverse(E->Scalars), Instruction::classof).base();
+      unsigned NumOfInstructions = std::distance(FirstInstr, LastInstr);
+      bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(NumOfInstructions);
+      if (PowerOf2Ceil(NumOfInstructions) != E->Scalars.size()) {
+        SmallVector<uint32_t, 4> ExtendedIndices(NumOfInstructions,
+                                                 NumOfInstructions);
+        const unsigned Dist = std::distance(E->Scalars.begin(), FirstInstr);
+        auto VI = FirstInstr;
+        for (unsigned I = 0; I < NumOfInstructions; ++I, ++VI) {
+          if (!isa<UndefValue>(*VI))
+            ExtendedIndices[I] = Dist + I;
+        }
+        VecValue = Builder.CreateShuffleVector(
+            VecValue, UndefValue::get(VecValue->getType()), ExtendedIndices,
+            "values.extend");
+      }
+      Value *VecPtr;
+      if (!IsPowOf2NumOfInstructions ||
+          NumOfInstructions == E->Scalars.size() || NumOfInstructions == 1) {
+        VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
+      } else {
+        VecPtr = Builder.CreateBitCast(
+            ScalarPtr,
+            VectorType::get(ScalarTy, NumOfInstructions)->getPointerTo(AS));
+      }
+      Instruction *VecSI;
+      if (IsPowOf2NumOfInstructions) {
+        StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
+        ST->setAlignment(Alignment);
+        VecSI = ST;
+      } else {
+        SmallVector<Constant *, 4> Mask;
+        Mask.reserve(E->Scalars.size());
+        for (auto *V : E->Scalars)
+          Mask.emplace_back(Builder.getInt1(!isa<UndefValue>(V)));
+        VecSI = Builder.CreateMaskedStore(VecValue, VecPtr, Alignment,
+                                          ConstantVector::get(Mask));
+      }
 
      // The pointer operand uses an in-tree scalar, so add the new BitCast to
      // ExternalUses to make sure that an extract will be generated in the
@@ -3441,11 +3735,7 @@
      if (getTreeEntry(ScalarPtr))
        ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
 
-      if (!Alignment)
-        Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
-
-      ST->setAlignment(Alignment);
-      Value *V = propagateMetadata(ST, E->Scalars);
+      Value *V = propagateMetadata(VecSI, llvm::to_vector<4>(InstructionsOnly));
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
@@ -3469,7 +3759,7 @@
      Value *V = Builder.CreateGEP(
          cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
      if (Instruction *I = dyn_cast<Instruction>(V))
-        V = propagateMetadata(I, E->Scalars);
+        V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly));
 
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -3572,6 +3862,11 @@
      unsigned e = E->Scalars.size();
      SmallVector<Constant *, 8> Mask(e);
      for (unsigned i = 0; i < e; ++i) {
+        if (isa<UndefValue>(E->Scalars[i])) {
+          Mask[i] = Builder.getInt32(i);
+          OpScalars.push_back(E->Scalars[i]);
+          continue;
+        }
        auto *OpInst = cast<Instruction>(E->Scalars[i]);
        assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
        if (OpInst->getOpcode() == S.getAltOpcode()) {
@@ -3589,7 +3884,7 @@
 
      Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
      if (Instruction *I = dyn_cast<Instruction>(V))
-        V = propagateMetadata(I, E->Scalars);
+        V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly));
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
@@ -3739,6 +4034,8 @@
    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
+      if (isa<UndefValue>(Scalar))
+        continue;
 
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
@@ -3862,14 +4159,15 @@
  bool ReSchedule = false;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.OpValue << "\n");
 
+  auto InstructionsOnly = make_filter_range(VL, Instruction::classof);
  // Make sure that the scheduling region contains all
  // instructions of the bundle.
-  for (Value *V : VL) {
+  for (Value *V : InstructionsOnly) {
    if (!extendSchedulingRegion(V, S))
      return false;
  }
 
-  for (Value *V : VL) {
+  for (Value *V : InstructionsOnly) {
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
@@ -3947,7 +4245,9 @@
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
-  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+  assert(Bundle->isSchedulingEntity() &&
+         (Bundle->isPartOfBundle() ||
+          llvm::count_if(VL, Instruction::classof) == 1) &&
         "tried to unbundle something which is not a bundle");
 
  // Un-bundle: make single instructions out of the bundle.
@@ -4252,7 +4552,9 @@
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
      assert(SD->isPartOfBundle() ==
-                 (getTreeEntry(SD->Inst) != nullptr) &&
+                 (getTreeEntry(SD->Inst) != nullptr &&
+                  llvm::count_if(getTreeEntry(SD->Inst)->Scalars,
+                                 Instruction::classof) > 1) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;
      if (SD->isSchedulingEntity()) {
@@ -4719,15 +5021,25 @@
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                            unsigned VecRegSize) {
-  const unsigned ChainLen = Chain.size();
+  unsigned ChainLen = Chain.size();
  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                    << "\n");
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  const unsigned VF = VecRegSize / Sz;
 
-  if (!isPowerOf2_32(Sz) || VF < 2)
+  if (VF < 2 || ChainLen < 2)
    return false;
+  SmallVector<Value *, 8> FixedChain;
+  if (!isPowerOf2_32(ChainLen)) {
+    unsigned NewSize = PowerOf2Ceil(ChainLen);
+    FixedChain.reserve(NewSize);
+    FixedChain.append(Chain.begin(), Chain.end());
+    FixedChain.append(NewSize - Chain.size(),
+                      UndefValue::get(Chain[0]->getType()));
+    Chain = FixedChain;
+    ChainLen = NewSize;
+  }
 
  // Keep track of values that were deleted by vectorizing in the loop below.
const SmallVector TrackValues(Chain.begin(), Chain.end()); @@ -4900,10 +5212,10 @@ return false; Instruction *I0 = cast(S.OpValue); - unsigned Sz = R.getVectorElementSize(I0); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); - unsigned MaxVF = std::max(PowerOf2Floor(VL.size()), MinVF); - if (MaxVF < 2) { + unsigned Pow2VL = PowerOf2Ceil(VL.size()); + unsigned MinVF = 2; + unsigned MaxVF = Pow2VL; + if (MaxVF < 2 || VL.size() < 2) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) << "Cannot SLP vectorize list: vectorization factor " @@ -4937,8 +5249,7 @@ SmallVector TrackValues(VL.begin(), VL.end()); unsigned NextInst = 0, MaxInst = VL.size(); - for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; - VF /= 2) { + for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). @@ -4953,7 +5264,7 @@ else OpsWidth = VF; - if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) + if (OpsWidth < 2 || (VF > MinVF && OpsWidth <= VF / 2)) break; // Check that a previous iteration of this loop did not delete the Value. @@ -4963,6 +5274,22 @@ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); ArrayRef Ops = VL.slice(I, OpsWidth); + SmallVector FixedChain; + if (UserCost && OpsWidth != VF) { + unsigned NewSize = VF; + FixedChain.reserve(NewSize); + FixedChain.append(Ops.begin(), Ops.end()); + FixedChain.append(NewSize - Ops.size(), + UndefValue::get(Ops[0]->getType())); + Ops = FixedChain; + } else if (OpsWidth != VF && !AllowReorder) { + unsigned NewSize = VF; + FixedChain.reserve(NewSize); + FixedChain.append(Ops.begin(), Ops.end()); + FixedChain.append(NewSize - Ops.size(), + UndefValue::get(Ops[0]->getType())); + Ops = FixedChain; + } R.buildTree(Ops); Optional> Order = R.bestOrder(); @@ -4982,6 +5309,8 @@ R.computeMinimumValueSizes(); int Cost = R.getTreeCost() - UserCost; + LLVM_DEBUG(dbgs() << "SLP: User vectorization cost: " << -UserCost + << ".\n"); CandidateFound = true; MinCost = std::min(MinCost, Cost); @@ -4999,6 +5328,10 @@ NextInst = I + 1; Changed = true; } + if (UserCost && VF == MaxVF && I == 0) { + UserCost -= TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } } } @@ -5796,9 +6129,26 @@ Optional> Order = V.bestOrder(); // TODO: Handle orders of size less than number of elements in the vector. if (Order && Order->size() == VL.size()) { + SmallVector NewOrder(Order->begin(), Order->end()); + SmallVector UsedIndices(Order->size(), false); + unsigned BoundVal = NewOrder.size() + 1; + for (unsigned I : *Order) + if (I != BoundVal) + UsedIndices[I] = true; + unsigned Idx = 0, E = UsedIndices.size(); + for (unsigned &I : NewOrder) { + if (I == BoundVal) { + // Find first non-used index. + for (; Idx != E; ++Idx) + if (!UsedIndices[Idx]) + break; + // Set correct index. + I = Idx; ++Idx; + } + } // TODO: reorder tree nodes without tree rebuilding. 
SmallVector<Value *, 4> ReorderedOps(VL.size()); - llvm::transform(*Order, ReorderedOps.begin(), + llvm::transform(NewOrder, ReorderedOps.begin(), [VL](const unsigned Idx) { return VL[Idx]; }); V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); } @@ -5880,6 +6230,15 @@ return VectorizedTree != nullptr; } + SmallVector<Value *, 4> getExtraArgs() const { + SmallVector<Value *, 4> Args(ExtraArgs.size()); + auto It = ExtraArgs.begin(); + for (unsigned I = 0, E = ExtraArgs.size(); I < E; ++I, ++It) { + Args[I] = It->second; + } + return Args; + } + unsigned numReductionValues() const { return ReducedVals.size(); } @@ -6015,7 +6374,9 @@ if (isa<UndefValue>(V)) break; LastInsertElem = dyn_cast<InsertElementInst>(V); - if (!LastInsertElem || !LastInsertElem->hasOneUse()) + if (!LastInsertElem) + break; + if (!LastInsertElem->hasOneUse()) return false; } while (true); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); @@ -6150,6 +6511,15 @@ // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; + // Try to vectorize ExtraArgs. + // Continue analysis for the instruction from the same basic block + // only to save compile time. + if (++Level < RecursionMaxDepth) + for (auto *Op : HorRdx.getExtraArgs()) + if (VisitedInstrs.insert(Op).second) + if (auto *I = dyn_cast<Instruction>(Op)) + if (!isa<PHINode>(I) && I->getParent() == BB) + Stack.emplace_back(Op, Level); continue; } } Index: test/Transforms/SLPVectorizer/AArch64/PR38339.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/PR38339.ll +++ test/Transforms/SLPVectorizer/AArch64/PR38339.ll @@ -3,15 +3,17 @@ define void @f1(<2 x i16> %x, i16* %a) { ; CHECK-LABEL: @f1( -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[X:%.*]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[X]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[A:%.*]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]] ; CHECK-NEXT: ret void ; %t2 = extractelement <2 x i16> %x, i32 0 @@ -35,15 +37,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr
inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[A]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] @@ -82,16 +86,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[REORDER_SHUFFLE]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[A]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR3]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] Index: test/Transforms/SLPVectorizer/AArch64/gather-root.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -235,12 +235,10 @@ ; ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: -; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer -; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 -; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 +; MAX-COST-NEXT: [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 +; MAX-COST-NEXT: [[P1:%.*]] = icmp eq i8 [[P0]], 0 +; MAX-COST-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* 
bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2) to <4 x i8>*), i32 2, <4 x i1> , <4 x i8> undef) +; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], ; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 ; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -249,16 +247,17 @@ ; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 ; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 ; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[P1]], i32 0 ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0 -; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 -; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP4:%.*]] = insertelement <4 x i1> [[TMP2]], i1 [[TMP3]], i32 1 +; MAX-COST-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP4]], i1 [[TMP5]], i32 2 +; MAX-COST-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; MAX-COST-NEXT: [[TMP8:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[TMP7]], i32 3 +; MAX-COST-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> , <4 x i32> ; MAX-COST-NEXT: [[P20:%.*]] = add i32 -5, undef ; MAX-COST-NEXT: [[P22:%.*]] = add i32 [[P20]], undef ; MAX-COST-NEXT: [[P24:%.*]] = add i32 [[P22]], undef @@ -266,10 +265,10 @@ ; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P28:%.*]] = add i32 [[P26]], [[P27]] ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP8]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP9]]) +; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P27]] +; MAX-COST-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], [[P29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP12]], -5 ; MAX-COST-NEXT: [[P30:%.*]] = add i32 [[P28]], [[P29]] ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -77,22 +77,22 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: 
@build_vec_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: ret <4 x i32> [[TMP11]] +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE3:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE4:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE3]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE5:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE6:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE5]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE4]], [[SHUFFLE6]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[SHUFFLE4]], [[SHUFFLE6]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP3]], [[TMP6]] +; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 @@ -123,18 +123,20 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> 
[[V1]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 +; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] +; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] +; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0 +; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 +; CHECK-NEXT: [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_0]], i32 2 +; CHECK-NEXT: [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_1]], i32 3 ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -198,21 +200,25 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_3_binops( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 +; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = xor 
<2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] +; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0 +; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 +; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -240,28 +246,28 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @reduction_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = lshr <4 x i32> [[TMP11]], -; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i32> [[TMP12]], -; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i32> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP16]]) -; CHECK-NEXT: ret i32 [[TMP17]] +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE3:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE4:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE3]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE5:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE6:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE5]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> 
[[SHUFFLE]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[SHUFFLE4]], [[SHUFFLE6]] +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE4]], [[SHUFFLE6]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: ret i32 [[TMP13]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 Index: test/Transforms/SLPVectorizer/X86/PR32086.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR32086.ll +++ test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -6,7 +6,8 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 @@ -35,13 +36,14 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[REORDER_SHUFFLE]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[ST]] to <4 x i64>* -; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* +; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1 @@ -65,7 +67,8 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: 
[[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 Index: test/Transforms/SLPVectorizer/X86/PR39774.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR39774.ll +++ test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -7,8 +7,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <8 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef @@ -81,24 +81,24 @@ ; CHECK-NEXT: [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]] ; CHECK-NEXT: [[VAL_42:%.*]] = and i32 [[VAL_40]], undef ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[OP_EXTRA30]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 14910, i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 14910, i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> undef, i32 [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP15]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP14]], i32 1 ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <4 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> ; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 ; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef @@ 
-174,13 +174,13 @@ ; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 ; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA29]], [[VAL_39]] ; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_41]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1 -; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> +; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[VAL_40]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[VAL_41]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 14910, i32 1 +; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: Index: test/Transforms/SLPVectorizer/X86/PR40310.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR40310.ll +++ test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -4,11 +4,11 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) { ; CHECK-LABEL: @mainTest( ; CHECK-NEXT: bci_15.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> , i32 [[PARAM:%.*]], i32 1 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <16 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 ; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4 @@ -40,8 +40,8 @@ ; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[V43:%.*]] = and i32 undef, [[V42]] ; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> undef, i32 [[V44]], i32 0 +; CHECK-NEXT: [[TMP7]] = insertelement <16 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void Index: test/Transforms/SLPVectorizer/X86/addsub.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/addsub.ll +++ 
test/Transforms/SLPVectorizer/X86/addsub.ll @@ -301,22 +301,22 @@ define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %d) { ; CHECK-LABEL: @reorder_alt_rightsubTree( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[D]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[B]] to <2 x double>* -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP6]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fsub <2 x double> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[C]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP15]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[B:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[D:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = fsub double [[TMP3]], [[TMP4]] +; CHECK-NEXT: store double [[TMP5]], double* [[C:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[D]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = load double, double* [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = load double, double* [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd double [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd double [[TMP7]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[C]], i64 1 +; CHECK-NEXT: store double [[TMP13]], double* [[TMP14]] ; CHECK-NEXT: ret void ; %1 = load double, double* %a Index: test/Transforms/SLPVectorizer/X86/alternate-cast.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -8,26 +8,24 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE-LABEL: @sitofp_uitofp( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: 
[[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float +; SSE-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[REORDER_SHUFFLE]] to <4 x float> ; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float ; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float ; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float ; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 @@ -35,26 +33,24 @@ ; SSE-NEXT: ret <8 x float> [[R7]] ; ; SLM-LABEL: @sitofp_uitofp( -; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> ; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 ; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SLM-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SLM-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SLM-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SLM-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float +; SLM-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[REORDER_SHUFFLE]] to <4 x float> ; SLM-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float ; SLM-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float ; SLM-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float ; SLM-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float 
[[TMP3]], i32 1 +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 ; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 ; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 ; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 @@ -102,26 +98,24 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE-LABEL: @fptosi_fptoui( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; SSE-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -129,26 +123,24 @@ ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @fptosi_fptoui( -; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SLM-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; SLM-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; SLM-NEXT: [[A6:%.*]] = extractelement 
<8 x float> [[A]], i32 6 ; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SLM-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SLM-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SLM-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; SLM-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; SLM-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; SLM-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; SLM-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; SLM-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; SLM-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -156,26 +148,24 @@ ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX-LABEL: @fptosi_fptoui( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; AVX-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; AVX-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; AVX-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; 
AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 Index: test/Transforms/SLPVectorizer/X86/alternate-fp.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -56,20 +56,35 @@ ; SSE-NEXT: ret <8 x float> [[R7]] ; ; SLM-LABEL: @fmul_fdiv_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]] -; SLM-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <8 x i32> -; SLM-NEXT: [[TMP9:%.*]] = fdiv <4 x float> [[TMP5]], [[TMP6]] -; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> undef, <8 x i32> -; SLM-NEXT: [[R3:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP3]], <8 x i32> -; SLM-NEXT: [[R4:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> -; SLM-NEXT: [[R6:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP10]], <8 x i32> -; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[R6]], <8 x float> [[TMP8]], <8 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 +; SLM-NEXT: [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5 +; SLM-NEXT: [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6 +; SLM-NEXT: [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7 +; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], [[B0]] +; SLM-NEXT: [[AB1:%.*]] = fdiv float [[A1]], [[B1]] +; SLM-NEXT: [[AB2:%.*]] = fdiv float [[A2]], [[B2]] +; SLM-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A]], [[B]] +; SLM-NEXT: [[AB5:%.*]] = fdiv float [[A5]], [[B5]] +; SLM-NEXT: [[AB6:%.*]] = fdiv float [[A6]], [[B6]] +; SLM-NEXT: [[AB7:%.*]] = fmul float [[A7]], [[B7]] +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 
x float> [[R2]], float [[TMP2]], i32 3 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP3]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 ; SLM-NEXT: ret <8 x float> [[R7]] ; ; AVX-LABEL: @fmul_fdiv_v8f32( @@ -125,14 +140,15 @@ ; SSE-NEXT: ret <4 x float> [[TMP1]] ; ; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; SLM-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> ; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00 +; SLM-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[REORDER_SHUFFLE]], ; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1 ; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2 ; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3 ; SLM-NEXT: ret <4 x float> [[R3]] Index: test/Transforms/SLPVectorizer/X86/alternate-int.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -83,20 +83,17 @@ ; ; SLM-LABEL: @add_mul_v4i32( ; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 ; SLM-NEXT: [[B0:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <4 x i32> [[B]], i32 1 -; SLM-NEXT: [[B2:%.*]] = extractelement <4 x i32> [[B]], i32 2 ; SLM-NEXT: [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3 ; SLM-NEXT: [[AB0:%.*]] = mul i32 [[A0]], [[B0]] -; SLM-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B1]] -; SLM-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B2]] +; SLM-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[B]] ; SLM-NEXT: [[AB3:%.*]] = mul i32 [[A3]], [[B3]] ; SLM-NEXT: [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[TMP2]], i32 1 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SLM-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[TMP3]], i32 2 ; SLM-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3 ; SLM-NEXT: ret <4 x i32> [[R3]] ; @@ -146,24 +143,62 @@ ; SSE-NEXT: [[AB2:%.*]] = 
ashr i32 [[A2]], [[B2]] ; SSE-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] ; SSE-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 ; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 ; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> +; SSE-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> +; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP2]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @ashr_shl_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; SLM-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] +; SLM-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] +; SLM-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] +; SLM-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] +; SLM-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[R7]] ; -; AVX-LABEL: @ashr_shl_v8i32( -; AVX-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX-NEXT: ret <8 x i32> [[R7]] +; AVX1-LABEL: @ashr_shl_v8i32( +; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 +; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] +; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] +; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] +; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 
[[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @ashr_shl_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @ashr_shl_v8i32( ; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] @@ -208,27 +243,27 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-LABEL: @ashr_shl_v8i32_const( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[REORDER_SHUFFLE]], +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @ashr_shl_v8i32_const( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; SLM-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[REORDER_SHUFFLE]], +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; SLM-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[REORDER_SHUFFLE]], +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_shl_v8i32_const( @@ -274,32 +309,29 @@ ; SSE-LABEL: @ashr_lshr_shl_v8i32( ; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 ; 
SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 ; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 ; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; SSE-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] -; SSE-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] +; SSE-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] ; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -309,15 +341,21 @@ ; SLM-LABEL: @ashr_lshr_shl_v8i32( ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; SLM-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; SLM-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; SLM-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SLM-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SLM-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] ; SLM-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SLM-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SLM-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SLM-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 @@ -326,10 +364,8 @@ ; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 ; SLM-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 ; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 -; SLM-NEXT: 
[[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; SLM-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SLM-NEXT: ret <8 x i32> [[R7]] @@ -337,15 +373,21 @@ ; AVX1-LABEL: @ashr_lshr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; AVX1-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; AVX1-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; AVX1-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; AVX1-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] ; AVX1-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; AVX1-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; AVX1-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX1-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 @@ -354,10 +396,8 @@ ; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 ; AVX1-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 ; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX1-NEXT: ret <8 x i32> [[R7]] @@ -367,25 +407,22 @@ ; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX2-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 ; AVX2-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]] ; AVX2-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX2-NEXT: [[AB7:%.*]] = shl 
i32 [[A7]], [[B7]] -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 4 +; AVX2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP8]], i32 5 ; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX2-NEXT: ret <8 x i32> [[R7]] @@ -395,25 +432,22 @@ ; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX512-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 ; AVX512-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]] ; AVX512-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX512-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX512-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], 
i32 [[TMP10]], i32 4 -; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 +; AVX512-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 4 +; AVX512-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP8]], i32 5 ; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX512-NEXT: ret <8 x i32> [[R7]] @@ -486,26 +520,74 @@ } define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { -; CHECK-LABEL: @sdiv_v8i32_undefs( -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; CHECK-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; CHECK-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x i32> [[R7]] +; SSE-LABEL: @sdiv_v8i32_undefs( +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SSE-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SSE-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SSE-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SSE-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SSE-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R5:%.*]] = 
insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SSE-NEXT: ret <8 x i32> [[R7]] +; +; SLM-LABEL: @sdiv_v8i32_undefs( +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SLM-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SLM-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SLM-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SLM-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SLM-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SLM-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SLM-NEXT: ret <8 x i32> [[R7]] +; +; AVX1-LABEL: @sdiv_v8i32_undefs( +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX1-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; AVX1-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX1-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; AVX1-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @sdiv_v8i32_undefs( +; AVX2-NEXT: ret <8 x i32> undef +; +; AVX512-LABEL: @sdiv_v8i32_undefs( +; AVX512-NEXT: ret <8 x i32> undef ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -535,15 +617,127 @@ } define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { -; CHECK-LABEL: @add_sub_v8i32_splat( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[R7:%.*]] = shufflevector <4 x 
i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[R7]] +; SSE-LABEL: @add_sub_v8i32_splat( +; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[AB0:%.*]] = add i32 [[A0]], [[B:%.*]] +; SSE-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B]] +; SSE-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[B]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; SSE-NEXT: [[AB7:%.*]] = sub i32 [[B]], [[A7]] +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 4 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP8]], i32 5 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP9]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SSE-NEXT: ret <8 x i32> [[R7]] +; +; SLM-LABEL: @add_sub_v8i32_splat( +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SLM-NEXT: [[AB0:%.*]] = add i32 [[A0]], [[B:%.*]] +; SLM-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B]] +; SLM-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B]] +; SLM-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[B]], i32 0 +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> zeroinitializer +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; SLM-NEXT: [[AB7:%.*]] = sub i32 [[B]], [[A7]] +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 4 +; SLM-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP8]], i32 5 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; SLM-NEXT: [[R6:%.*]] = 
insertelement <8 x i32> [[R5]], i32 [[TMP9]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SLM-NEXT: ret <8 x i32> [[R7]] +; +; AVX1-LABEL: @add_sub_v8i32_splat( +; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[AB0:%.*]] = add i32 [[A0]], [[B:%.*]] +; AVX1-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B]] +; AVX1-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B]] +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[B]], i32 0 +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> zeroinitializer +; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +; AVX1-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; AVX1-NEXT: [[AB7:%.*]] = sub i32 [[B]], [[A7]] +; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 4 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP8]], i32 5 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP9]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @add_sub_v8i32_splat( +; AVX2-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX2-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX2-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <8 x i32> +; AVX2-NEXT: [[AB0:%.*]] = add i32 [[A0]], [[B:%.*]] +; AVX2-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B]] +; AVX2-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B]] +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[B]], i32 0 +; AVX2-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = add <8 x i32> [[SHUFFLE]], [[REORDER_SHUFFLE]] +; AVX2-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[SHUFFLE]], [[REORDER_SHUFFLE]] +; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX2-NEXT: [[R3:%.*]] = shufflevector <8 x i32> [[R2]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP3]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[R7]] +; +; AVX512-LABEL: @add_sub_v8i32_splat( +; AVX512-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX512-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX512-NEXT: [[REORDER_SHUFFLE:%.*]] = 
shufflevector <8 x i32> [[A]], <8 x i32> undef, <8 x i32> +; AVX512-NEXT: [[AB0:%.*]] = add i32 [[A0]], [[B:%.*]] +; AVX512-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B]] +; AVX512-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B]] +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[B]], i32 0 +; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = add <8 x i32> [[SHUFFLE]], [[REORDER_SHUFFLE]] +; AVX512-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[SHUFFLE]], [[REORDER_SHUFFLE]] +; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX512-NEXT: [[R3:%.*]] = shufflevector <8 x i32> [[R2]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP3]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 Index: test/Transforms/SLPVectorizer/X86/commutativity.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/commutativity.ll +++ test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -20,24 +20,24 @@ define void @splat(i8 %a, i8 %b, i8 %c) { ; CHECK-LABEL: @splat( ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i8> undef, i8 [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[A:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[A]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[A]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[A]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[B:%.*]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[A]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[B]], i32 7 +; CHECK-NEXT: 
[[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[A]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[A]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[A]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[A]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[A]], i32 12 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[A]], i32 13 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[A]], i32 14 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[A]], i32 15 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i8> undef, i8 [[A]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[C]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> undef, <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]] ; CHECK-NEXT: store <16 x i8> [[TMP19]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16 ; CHECK-NEXT: ret void @@ -84,20 +84,18 @@ define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: @same_opcode_on_one_side( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]] -; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[C:%.*]], [[A:%.*]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[C]], [[A]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[A]], [[C]] +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[C]], [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[ADD1]], [[A]] +; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[B:%.*]], [[ADD2]] +; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[C]], [[ADD3]] +; CHECK-NEXT: store i32 [[TMP3]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[A]], [[ADD4]] +; CHECK-NEXT: store i32 [[TMP4]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3) ; CHECK-NEXT: ret void ; %add1 = add i32 %c, %a Index: test/Transforms/SLPVectorizer/X86/crash_cmpop.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -55,35 +55,32 @@ ; AVX: for.body: ; 
AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 -; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; AVX-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]] -; AVX-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], -; AVX-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> -; AVX-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], -; AVX-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP14]] -; AVX-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 -; AVX-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 -; AVX-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP17]], i32 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1 -; AVX-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], -; AVX-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> -; AVX-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], -; AVX-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> , <2 x float> [[TMP21]] +; AVX-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[REORDER_SHUFFLE]], [[TMP3]] +; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> +; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], 
i32 1 +; AVX-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] +; AVX-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP13]], i32 0 +; AVX-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 +; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], +; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> +; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], +; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AVX: for.end: Index: test/Transforms/SLPVectorizer/X86/crash_lencod.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_lencod.ll +++ test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -126,14 +126,14 @@ define fastcc void @dct36(double* %inbuf) { ; CHECK-LABEL: @dct36( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 2 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX44]], align 8 -; CHECK-NEXT: [[ADD46:%.*]] = fadd double [[TMP0]], undef -; CHECK-NEXT: store double [[ADD46]], double* [[ARRAYIDX41]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[INBUF]], align 8 -; CHECK-NEXT: [[ADD49:%.*]] = fadd double [[TMP1]], [[TMP0]] -; CHECK-NEXT: store double [[ADD49]], double* [[ARRAYIDX44]], align 8 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[INBUF]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/crash_smallpt.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -31,15 +31,14 @@ ; CHECK: cond.false66.us: ; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[ADD_I276_US]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double undef, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> undef, [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* 
[[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; CHECK: cond.true63.us: ; CHECK-NEXT: unreachable Index: test/Transforms/SLPVectorizer/X86/cse.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cse.ll +++ test/Transforms/SLPVectorizer/X86/cse.ll @@ -18,21 +18,16 @@ ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[LOAD_EXTEND]], <4 x double> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[SHUFFLE]], +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[SHUFFLE1]], ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: @@ -131,7 +126,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP3]], 4.000000e+00 ; CHECK-NEXT: br i1 [[TMP1]], label [[TMP14:%.*]], label [[TMP5:%.*]] -; CHECK: [[TMP6:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = fmul double [[TMP7]], 3.000000e+00 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0 @@ -141,7 +137,8 @@ ; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[G]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP13]], align 8 ; CHECK-NEXT: br label [[TMP24:%.*]] -; CHECK: [[TMP15:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 +; CHECK: 14: +; 
CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP17:%.*]] = load double, double* [[TMP16]], align 8 ; CHECK-NEXT: [[TMP18:%.*]] = fmul double [[TMP17]], 3.000000e+00 @@ -152,7 +149,8 @@ ; CHECK-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP15]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP21]], <2 x double>* [[TMP23]], align 8 ; CHECK-NEXT: br label [[TMP24]] -; CHECK: ret i32 undef +; CHECK: 24: +; CHECK-NEXT: ret i32 undef ; %1 = icmp eq i32 %k, 0 %2 = getelementptr inbounds double, double* %G, i64 5 Index: test/Transforms/SLPVectorizer/X86/extract.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/extract.ll +++ test/Transforms/SLPVectorizer/X86/extract.ll @@ -54,14 +54,11 @@ ; CHECK-LABEL: @fextr2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <4 x double>, <4 x double>* undef -; CHECK-NEXT: [[V0:%.*]] = extractelement <4 x double> [[LD]], i32 0 -; CHECK-NEXT: [[V1:%.*]] = extractelement <4 x double> [[LD]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x double> [[LD]], <4 x double> undef, <2 x i32> ; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[P0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[REORDER_SHUFFLE]], +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/hadd.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/hadd.ll +++ test/Transforms/SLPVectorizer/X86/hadd.ll @@ -202,13 +202,22 @@ ; SSE-NEXT: ret <4 x double> [[R03]] ; ; SLM-LABEL: @test_v4f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 +; SLM-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] +; SLM-NEXT: [[R1:%.*]] = fadd double [[B0]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = fadd double [[A2]], [[A3]] 
+; SLM-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0 +; SLM-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1 +; SLM-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2 +; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3 ; SLM-NEXT: ret <4 x double> [[R03]] ; ; AVX-LABEL: @test_v4f64( @@ -322,13 +331,22 @@ ; SSE-NEXT: ret <4 x i64> [[R03]] ; ; SLM-LABEL: @test_v4i64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i64> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i64> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i64> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i64> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x i64> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x i64> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x i64> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x i64> [[B]], i32 3 +; SLM-NEXT: [[R0:%.*]] = add i64 [[A0]], [[A1]] +; SLM-NEXT: [[R1:%.*]] = add i64 [[B0]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = add i64 [[A2]], [[A3]] +; SLM-NEXT: [[R3:%.*]] = add i64 [[B2]], [[B3]] +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x i64> undef, i64 [[R0]], i32 0 +; SLM-NEXT: [[R01:%.*]] = insertelement <4 x i64> [[R00]], i64 [[R1]], i32 1 +; SLM-NEXT: [[R02:%.*]] = insertelement <4 x i64> [[R01]], i64 [[R2]], i32 2 +; SLM-NEXT: [[R03:%.*]] = insertelement <4 x i64> [[R02]], i64 [[R3]], i32 3 ; SLM-NEXT: ret <4 x i64> [[R03]] ; ; AVX-LABEL: @test_v4i64( @@ -432,13 +450,13 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = add <4 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[B:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[RV15:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP6]], <16 x i32> ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( Index: 
test/Transforms/SLPVectorizer/X86/horizontal-list.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -23,44 +23,62 @@ ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]] +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] ; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]] ; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]] -; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]] -; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]] -; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4 -; CHECK-NEXT: ret float [[ADD19_3]] +; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP8]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[BIN_RDX]], i32 0 +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP9]], [[TMP5]] +; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[TMP4]] +; CHECK-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], [[ADD_1]] +; CHECK-NEXT: [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]] +; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] +; CHECK-NEXT: store float [[OP_EXTRA3]], float* @res, align 4 +; CHECK-NEXT: ret float [[OP_EXTRA3]] ; ; THRESHOLD-LABEL: @baz( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]] -; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 -; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 -; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <2 x 
float> [[TMP7]], [[TMP6]] -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16 +; THRESHOLD-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> undef, float [[TMP4]], i32 0 +; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[MUL4]], i32 1 +; THRESHOLD-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0 +; THRESHOLD-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[CONV]], i32 1 +; THRESHOLD-NEXT: [[TMP9:%.*]] = fmul fast <2 x float> [[TMP6]], [[TMP8]] +; THRESHOLD-NEXT: [[TMP10:%.*]] = fadd fast <2 x float> [[TMP6]], [[TMP8]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> +; THRESHOLD-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; THRESHOLD-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP12]], [[TMP13]] +; THRESHOLD-NEXT: [[TMP14:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 +; THRESHOLD-NEXT: [[TMP15:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 +; THRESHOLD-NEXT: [[TMP16:%.*]] = fmul fast <2 x float> [[TMP15]], [[TMP14]] +; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] +; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] ; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]] -; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]] -; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]] -; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]] -; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]] -; THRESHOLD-NEXT: store float [[ADD19_3]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[ADD19_3]] +; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL4]], [[ADD7]] +; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP12]], [[ADD19]] +; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]] +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> undef, <2 x i32> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP16]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[BIN_RDX]], i32 0 +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP17]], [[TMP12]] +; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[MUL4]] +; THRESHOLD-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], [[ADD_1]] +; 
THRESHOLD-NEXT: [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]] +; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] +; THRESHOLD-NEXT: store float [[OP_EXTRA3]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[OP_EXTRA3]] ; entry: %0 = load i32, i32* @n, align 4 Index: test/Transforms/SLPVectorizer/X86/hsub.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/hsub.ll +++ test/Transforms/SLPVectorizer/X86/hsub.ll @@ -202,13 +202,22 @@ ; SSE-NEXT: ret <4 x double> [[R03]] ; ; SLM-LABEL: @test_v4f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 +; SLM-NEXT: [[R0:%.*]] = fsub double [[A0]], [[A1]] +; SLM-NEXT: [[R1:%.*]] = fsub double [[B0]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = fsub double [[A2]], [[A3]] +; SLM-NEXT: [[R3:%.*]] = fsub double [[B2]], [[B3]] +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0 +; SLM-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1 +; SLM-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2 +; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3 ; SLM-NEXT: ret <4 x double> [[R03]] ; ; AVX-LABEL: @test_v4f64( @@ -322,13 +331,22 @@ ; SSE-NEXT: ret <4 x i64> [[R03]] ; ; SLM-LABEL: @test_v4i64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i64> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i64> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i64> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i64> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x i64> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x i64> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x i64> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x i64> [[B]], i32 3 +; SLM-NEXT: [[R0:%.*]] = sub 
i64 [[A0]], [[A1]] +; SLM-NEXT: [[R1:%.*]] = sub i64 [[B0]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = sub i64 [[A2]], [[A3]] +; SLM-NEXT: [[R3:%.*]] = sub i64 [[B2]], [[B3]] +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x i64> undef, i64 [[R0]], i32 0 +; SLM-NEXT: [[R01:%.*]] = insertelement <4 x i64> [[R00]], i64 [[R1]], i32 1 +; SLM-NEXT: [[R02:%.*]] = insertelement <4 x i64> [[R01]], i64 [[R2]], i32 2 +; SLM-NEXT: [[R03:%.*]] = insertelement <4 x i64> [[R02]], i64 [[R3]], i32 3 ; SLM-NEXT: ret <4 x i64> [[R03]] ; ; AVX-LABEL: @test_v4i64( @@ -432,13 +450,13 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = sub <4 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[B:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[RV15:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP6]], <16 x i32> ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( Index: test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -291,42 +291,33 @@ ; Unused insertelement define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { ; CHECK-LABEL: @simple_select_no_users( -; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 -; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> undef, <2 x i32> ; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 ; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> ; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <2 x i32> ; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 ; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 -; CHECK-NEXT: 
[[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[REORDER_SHUFFLE]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[REORDER_SHUFFLE1]], <2 x float> [[REORDER_SHUFFLE2]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[A3]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[B3]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x float> [[TMP7]], <2 x float> [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 +; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP13]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP14]], i32 3 ; CHECK-NEXT: ret <4 x float> [[RD]] ; ; ZEROTHRESH-LABEL: @simple_select_no_users( Index: 
test/Transforms/SLPVectorizer/X86/operandorder.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/operandorder.ll +++ test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -10,10 +10,13 @@ define void @shuffle_operands1(double * noalias %from, double * noalias %to, ; CHECK-LABEL: @shuffle_operands1( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1 +; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4 +; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[V2:%.*]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[V1:%.*]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V0_2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 4 @@ -119,14 +122,11 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1 -; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4 -; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V0_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -202,16 +202,15 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1 -; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4 -; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[V0_2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = 
insertelement <2 x double> [[TMP2]], double [[P]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[P]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -292,32 +291,24 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32000 x float], [32000 x float]* @a, i32 0, i32 0), align 16 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX5]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP13]] -; CHECK-NEXT: [[TMP14]] = load float, float* [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i32 3 -; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] -; CHECK-NEXT: store float [[MUL45]], float* [[ARRAYIDX31]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[ARRAYIDX]] to 
<8 x float>* +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP5]], i32 4, <8 x i1> , <8 x float> undef) +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[ARRAYIDX5]] to <8 x float>* +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP10]], i32 4, <8 x i1> ) +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP11]], 31995 +; CHECK-NEXT: [[TMP12]] = extractelement <8 x float> [[TMP6]], i32 4 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void @@ -372,13 +363,19 @@ define void @load_reorder_double(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b){ ; CHECK-LABEL: @load_reorder_double( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[A]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = load double, double* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[C:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 4 ; CHECK-NEXT: ret void ; %1 = load double, double* %a Index: test/Transforms/SLPVectorizer/X86/partail.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/partail.ll +++ test/Transforms/SLPVectorizer/X86/partail.ll @@ -18,26 +18,23 @@ ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[SHR15]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 undef, i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 undef, i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 undef, i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] 
= icmp slt <4 x i32> [[TMP7]], undef -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> undef -; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i32> [[TMP9]] to <4 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP19]] +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], undef +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP4]], <4 x i32> undef +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP16]] ; CHECK-NEXT: unreachable ; entry: Index: test/Transforms/SLPVectorizer/X86/phi.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/phi.ll +++ test/Transforms/SLPVectorizer/X86/phi.ll @@ -140,49 +140,52 @@ define float @foo3(float* nocapture readonly %A) #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <8 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP0]], i32 4, <8 x i1> , <8 x float> undef) +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = 
shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[REORDER_SHUFFLE]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 -; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] -; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = phi <8 x float> [ [[SHUFFLE]], [[ENTRY]] ], [ [[TMP30:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 5 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP11]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], -; CHECK-NEXT: [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX14]] to <4 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP6]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x float> undef, float [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 2 +; 
CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP3]], i32 4 +; CHECK-NEXT: [[TMP16:%.*]] = fmul <8 x float> [[TMP15]], +; CHECK-NEXT: [[TMP17:%.*]] = fadd <8 x float> [[TMP2]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP18]], 121 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x float> [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x float> undef, float [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x float> [[TMP17]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x float> [[TMP20]], float [[TMP21]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x float> [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <8 x float> [[TMP22]], float [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x float> [[TMP17]], i32 3 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP25]], i32 3 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[TMP17]], i32 4 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP27]], i32 4 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x float> [[TMP28]], float [[TMP8]], i32 5 +; CHECK-NEXT: [[TMP30]] = insertelement <8 x float> [[TMP29]], float [[TMP10]], i32 6 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 3 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 2 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 1 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]] +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[TMP27]], [[TMP25]] +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP23]] +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP21]] +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP19]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: Index: test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -11,33 +11,31 @@ ; CHECK-NEXT: ret void ; CHECK: bb2: ; CHECK-NEXT: [[TMP:%.*]] = select i1 undef, i16 undef, i16 15 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> undef, i16 [[TMP]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 undef, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32> -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> , [[REORDER_SHUFFLE]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef -; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE8]], +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[TMP]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> , 
[[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], undef +; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE8]], ; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt i32 undef, undef ; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 undef, i32 undef ; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], undef ; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef ; CHECK-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], undef -; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt <4 x i32> [[TMP5]], [[RDX_SHUF9]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP10]], <4 x i32> [[TMP5]], <4 x i32> [[RDX_SHUF9]] +; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF9]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP10]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF9]] ; CHECK-NEXT: [[RDX_SHUF12:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT11]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP13:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT11]], [[RDX_SHUF12]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT14:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP13]], <4 x i32> [[RDX_MINMAX_SELECT11]], <4 x i32> [[RDX_SHUF12]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT14]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT14]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; CHECK-NEXT: [[TMP19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef +; CHECK-NEXT: [[TMP19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef ; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP19]], 63 -; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[TMP6]], undef +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 undef, undef ; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 undef, i32 undef ; CHECK-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP27]], undef @@ -53,23 +51,23 @@ ; CHECK-NEXT: [[TMP41:%.*]] = icmp sgt i32 undef, undef ; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 undef, i32 undef ; CHECK-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP42]], [[TMP39]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP9]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP9]], <4 x i32> [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP8]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP8]], <4 x i32> [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = 
shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], undef -; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef -; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA]], undef -; CHECK-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA]], i32 undef -; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA4]], undef -; CHECK-NEXT: [[OP_EXTRA5:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA4]], i32 undef -; CHECK-NEXT: [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA5]], undef -; CHECK-NEXT: [[OP_EXTRA6:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA5]], i32 undef -; CHECK-NEXT: [[TMP15:%.*]] = icmp slt i32 [[OP_EXTRA6]], undef -; CHECK-NEXT: [[OP_EXTRA7:%.*]] = select i1 [[TMP15]], i32 [[OP_EXTRA6]], i32 undef +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = icmp slt i32 [[TMP9]], undef +; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 undef +; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[OP_EXTRA]], undef +; CHECK-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[TMP11]], i32 [[OP_EXTRA]], i32 undef +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA4]], undef +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA4]], i32 undef +; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA5]], undef +; CHECK-NEXT: [[OP_EXTRA6:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA5]], i32 undef +; CHECK-NEXT: [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA6]], undef +; CHECK-NEXT: [[OP_EXTRA7:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA6]], i32 undef ; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP39]], i32 [[TMP42]] ; CHECK-NEXT: [[TMP45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA7]] ; CHECK-NEXT: unreachable Index: test/Transforms/SLPVectorizer/X86/resched.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/resched.ll +++ test/Transforms/SLPVectorizer/X86/resched.ll @@ -32,45 +32,43 @@ ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 -; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 -; CHECK-NEXT: 
[[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 -; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> undef, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = lshr <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP15]], i32 2 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP17]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP19]], i32 4 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 
x i32> [[TMP9]], i32 4 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP21]], i32 5 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP23]], i32 6 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[TMP25]], i32 7 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[TMP27]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[TMP29]], i32 9 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[TMP31]], i32 10 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP11]], i32 2 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP33]], i32 11 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP11]], i32 3 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[TMP35]], i32 12 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i32> [[TMP11]], i32 4 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i32> [[TMP36]], i32 [[TMP37]], i32 13 +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i32> [[TMP11]], i32 5 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP38]], i32 [[TMP39]], i32 14 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP11]], i32 6 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[TMP41]], i32 15 ; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8> ; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> [[TMP43]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 Index: test/Transforms/SLPVectorizer/X86/rgb_phi.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -25,39 +25,37 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <2 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; 
CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00 ; CHECK-NEXT: [[ADD4]] = fadd float [[R_030]], [[MUL]] -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[MUL8:%.*]] = fmul float [[TMP5]], 8.000000e+00 -; CHECK-NEXT: [[ADD9]] = fadd float [[G_031]], [[MUL8]] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP7]], 9.000000e+00 -; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX7]] to <2 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[REORDER_SHUFFLE1]], +; CHECK-NEXT: [[TMP9]] = fadd <2 x float> [[TMP4]], [[TMP8]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], 121 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP10]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[ADD9]] -; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 +; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[TMP12]] ; CHECK-NEXT: ret float [[ADD17]] ; entry: Index: test/Transforms/SLPVectorizer/X86/schedule-bundle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/schedule-bundle.ll +++ test/Transforms/SLPVectorizer/X86/schedule-bundle.ll @@ -10,18 +10,10 @@ define i32 @slp_schedule_bundle() local_unnamed_addr #0 { ; CHECK-LABEL: @slp_schedule_bundle( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], -; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4 -; CHECK-NEXT: [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31 -; CHECK-NEXT: [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1 -; 
CHECK-NEXT: store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4 -; CHECK-NEXT: [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31 -; CHECK-NEXT: [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1 -; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast ([1 x i32]* @b to <8 x i32>*), i32 4, <8 x i1> , <8 x i32> undef) +; CHECK-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP2]], <8 x i32>* bitcast ([1 x i32]* @a to <8 x i32>*), i32 4, <8 x i1> ) ; CHECK-NEXT: ret i32 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/sext.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/sext.ll +++ test/Transforms/SLPVectorizer/X86/sext.ll @@ -166,17 +166,18 @@ ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 -; AVX-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; AVX-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 -; AVX-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2]] to <2 x i8>* +; AVX-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1 +; AVX-NEXT: [[TMP5:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; AVX-NEXT: [[TMP6:%.*]] = sext <2 x i8> [[TMP4]] to <2 x i64> +; AVX-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 +; AVX-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 +; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 +; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 ; AVX-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 @@ -301,27 +302,26 @@ ; AVX1-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX1-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; AVX1-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; AVX1-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; AVX1-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> -; AVX1-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i32 -; 
AVX1-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i32 -; AVX1-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i32 -; AVX1-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i32 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; AVX1-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; AVX1-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX1-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; AVX1-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; AVX1-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; AVX1-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i8* [[P4]] to <4 x i8>* +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; AVX1-NEXT: [[TMP5:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> +; AVX1-NEXT: [[TMP6:%.*]] = sext <4 x i8> [[TMP4]] to <4 x i32> +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP8]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; AVX1-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; AVX1-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP10]], i32 3 +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 +; AVX1-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP11]], i32 4 +; AVX1-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 +; AVX1-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP12]], i32 5 +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 +; AVX1-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP13]], i32 6 +; AVX1-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 +; AVX1-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP14]], i32 7 ; AVX1-NEXT: ret <8 x i32> [[V7]] ; ; AVX2-LABEL: @loadext_8i8_to_8i32( @@ -661,17 +661,18 @@ ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 -; AVX-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; AVX-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; AVX-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX-NEXT: 
[[TMP3:%.*]] = bitcast i16* [[P2]] to <2 x i16>*
+; AVX-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 1
+; AVX-NEXT: [[TMP5:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP4]] to <2 x i64>
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3
 ; AVX-NEXT: ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
@@ -845,17 +846,18 @@
 ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
-; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
-; AVX1-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64
-; AVX1-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64
-; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2]] to <2 x i32>*
+; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 1
+; AVX1-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX1-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64>
+; AVX1-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0
+; AVX1-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1
+; AVX1-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2
+; AVX1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3
 ; AVX1-NEXT: ret <4 x i64> [[V3]]
 ;
 ; AVX2-LABEL: @loadext_4i32_to_4i64(
Index: test/Transforms/SLPVectorizer/X86/slp-throttle.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/slp-throttle.ll
+++ test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -5,18 +5,20 @@
 ; CHECK-LABEL: @rftbsub(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
-; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef
 ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]]
-; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]]
-; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef
-; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
-; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
-; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[MUL18]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> undef, [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x double> [[TMP2]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP9]], align 8
 ; CHECK-NEXT: unreachable
 ;
 entry:
Index: test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -7,10 +7,11 @@
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[REORDER_SHUFFLE]], <2 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A2:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
@@ -81,12 +82,13 @@
 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A4:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A2:%.*]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A3:%.*]], i32 3
 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
@@ -159,12 +161,13 @@
 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A3:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A4:%.*]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A2:%.*]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A1:%.*]], i32 3
 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
Index: test/Transforms/SLPVectorizer/X86/zext.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/zext.ll
+++ test/Transforms/SLPVectorizer/X86/zext.ll
@@ -131,17 +131,18 @@
 ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
-; AVX-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; AVX-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
-; AVX-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64
-; AVX-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64
-; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2]] to <2 x i8>*
+; AVX-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1
+; AVX-NEXT: [[TMP5:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[TMP6:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i64>
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3
 ; AVX-NEXT: ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
@@ -535,17 +536,18 @@
 ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
-; AVX-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; AVX-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
-; AVX-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64
-; AVX-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64
-; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT: [[TMP3:%.*]] = bitcast i16* [[P2]] to <2 x i16>*
+; AVX-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 1
+; AVX-NEXT: [[TMP5:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[TMP6:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64>
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3
 ; AVX-NEXT: ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
@@ -719,17 +721,18 @@
 ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
-; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
-; AVX1-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64
-; AVX1-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64
-; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2]] to <2 x i32>*
+; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 1
+; AVX1-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX1-NEXT: [[TMP6:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
+; AVX1-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0
+; AVX1-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1
+; AVX1-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2
+; AVX1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3
 ; AVX1-NEXT: ret <4 x i64> [[V3]]
 ;
 ; AVX2-LABEL: @loadext_4i32_to_4i64(