diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3854,13 +3854,32 @@ ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); TE->setOperandsInOrder(); - for (unsigned i = 0, e = 2; i < e; ++i) { + for (unsigned I = 0, E = 2; I < E; ++I) { ValueList Operands; - // Prepare the operand vector. - for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + if (I >= 1) { + // Need to cast all elements to the same type before vectorization to + // avoid crash. + Type *VL0Ty = VL0->getOperand(I)->getType(); + Type *Ty = + all_of(VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); }) + ? VL0Ty + : DL->getIndexType(cast<GetElementPtrInst>(VL0) + ->getPointerOperandType() + ->getScalarType()); + // Prepare the operand vector. + for (Value *V : VL) { + auto *Op = cast<Instruction>(V)->getOperand(I); + auto *CI = cast<ConstantInt>(Op); + Operands.push_back(ConstantExpr::getIntegerCast( + CI, Ty, CI->getValue().isSignBitSet())); + } + } else { + // Prepare the operand vector. + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(I)); + } - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -5456,6 +5475,12 @@ } } + if (UsedTEs.empty()) { + assert(all_of(TE->Scalars, UndefValue::classof) && + "Expected vector of undefs only."); + return None; + } + unsigned VF = 0; if (UsedTEs.size() == 1) { // Try to find the perfect match in another gather node at first. @@ -8021,15 +8046,14 @@ using ReductionOpsType = SmallVector<Value *, 16>; using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; ReductionOpsListType ReductionOps; - SmallVector<Value *, 32> ReducedVals; + SmallVector<SmallVector<Value *>> ReducedVals; + DenseMap<Value *, Instruction *> ReducedValsToOps; // Use map vector to make stable output. MapVector<Instruction *, Value *> ExtraArgs; WeakTrackingVH ReductionRoot; /// The type of reduction operation. RecurKind RdxKind; - const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max(); - static bool isCmpSelMinMax(Instruction *I) { return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); @@ -8073,26 +8097,6 @@ return I->getOperand(Index); } - /// Checks if the ParentStackElem.first should be marked as a reduction - /// operation with an extra argument or as extra argument itself. - void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, - Value *ExtraArg) { - if (ExtraArgs.count(ParentStackElem.first)) { - ExtraArgs[ParentStackElem.first] = nullptr; - // We ran into something like: - // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. - // The whole ParentStackElem.first should be considered as an extra value - // in this case. - // Do not perform analysis of remaining operands of ParentStackElem.first - // instruction, this whole instruction is an extra argument. - ParentStackElem.second = INVALID_OPERAND_INDEX; - } else { - // We ran into something like: - // ParentStackElem.first += ... + ExtraArg + ... - ExtraArgs[ParentStackElem.first] = ExtraArg; - } - } - /// Creates reduction operation with the current opcode. static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, Value *RHS, const Twine &Name, bool UseSelect) { @@ -8177,7 +8181,7 @@ /// Creates reduction operation with the current opcode with the IR flags /// from \p I.
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, - Value *RHS, const Twine &Name, Instruction *I) { + Value *RHS, const Twine &Name, Value *I) { auto *SelI = dyn_cast<SelectInst>(I); Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr); if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { @@ -8188,8 +8192,10 @@ return Op; } - static RecurKind getRdxKind(Instruction *I) { - assert(I && "Expected instruction for reduction matching"); + static RecurKind getRdxKind(Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + return RecurKind::None; TargetTransformInfo::ReductionFlags RdxFlags; if (match(I, m_Add(m_Value(), m_Value()))) return RecurKind::Add; @@ -8353,7 +8359,8 @@ HorizontalReduction() = default; /// Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) { + bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst, + ScalarEvolution &SE, const DataLayout &DL) { assert((!Phi || is_contained(Phi->operands(), Inst)) && "Phi needs to use the binary operator"); assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) || @@ -8397,87 +8404,129 @@ ReductionRoot = Inst; - // The opcode for leaf values that we perform a reduction on. - // For example: load(x) + load(y) + load(z) + fptoui(w) - // The leaf opcode for 'w' does not match, so we don't include it as a - // potential candidate for the reduction. - unsigned LeafOpcode = 0; - - // Post-order traverse the reduction tree starting at Inst. We only handle - // true trees containing binary operators or selects. - SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; - Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst))); - initReductionOps(Inst); - while (!Stack.empty()) { - Instruction *TreeN = Stack.back().first; - unsigned EdgeToVisit = Stack.back().second++; - const RecurKind TreeRdxKind = getRdxKind(TreeN); - bool IsReducedValue = TreeRdxKind != RdxKind; - - // Postorder visit. - if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) { - if (IsReducedValue) - ReducedVals.push_back(TreeN); - else { - auto ExtraArgsIter = ExtraArgs.find(TreeN); - if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) { - // Check if TreeN is an extra argument of its parent operation. - if (Stack.size() <= 1) { - // TreeN can't be an extra argument as it is a root reduction - // operation. - return false; - } - // Yes, TreeN is an extra argument, do not add it to a list of - // reduction operations. - // Stack[Stack.size() - 2] always points to the parent operation. - markExtraArg(Stack[Stack.size() - 2], TreeN); - ExtraArgs.erase(TreeN); - } else - addReductionOps(TreeN); + // Iterate through all the operands of the possible reduction tree and + // gather all the reduced values, sorting them by their value id. + BasicBlock *BB = Inst->getParent(); + bool IsCmpSelMinMax = isCmpSelMinMax(Inst); + std::queue<Instruction *> Worklist; + Worklist.push(Inst); + auto &&CheckOperands = [this, IsCmpSelMinMax, + BB](Instruction *TreeN, + SmallVectorImpl<Value *> &ExtraArgs, + SmallVectorImpl<Value *> &PossibleReducedVals, + SmallVectorImpl<Instruction *> &ReductionOps) { + for (int I = getFirstOperandIndex(TreeN), + End = getNumberOfOperands(TreeN); + I < End; ++I) { + Value *EdgeVal = getRdxOperand(TreeN, I); + ReducedValsToOps.try_emplace(EdgeVal, TreeN); + auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); + // Edge has wrong parent - mark as an extra argument. + if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) && + !hasSameParent(EdgeInst, BB)) { + ExtraArgs.push_back(EdgeVal); + continue; } - // Retract.
- Stack.pop_back(); - continue; - } - - // Visit operands. - Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit); - auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); - if (!EdgeInst) { - // Edge value is not a reduction instruction or a leaf instruction. - // (It may be a constant, function argument, or something else.) - markExtraArg(Stack.back(), EdgeVal); - continue; + // If the edge is not an instruction, or it is different from main + // reduction opcode or has too many uses - possible reduced value. + if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind || + !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || + !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) { + PossibleReducedVals.push_back(EdgeVal); + continue; + } + ReductionOps.push_back(EdgeInst); } - RecurKind EdgeRdxKind = getRdxKind(EdgeInst); - // Continue analysis if the next operand is a reduction operation or - // (possibly) a leaf value. If the leaf value opcode is not set, - // the first met operation != reduction operation is considered as the - // leaf opcode. - // Only handle trees in the current basic block. - // Each tree node needs to have minimal number of users except for the - // ultimate reduction. - const bool IsRdxInst = EdgeRdxKind == RdxKind; - if (EdgeInst != Phi && EdgeInst != Inst && - hasSameParent(EdgeInst, Inst->getParent()) && - hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) && - (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) { - if (IsRdxInst) { - // We need to be able to reassociate the reduction operations. - if (!isVectorizable(EdgeRdxKind, EdgeInst)) { - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), EdgeInst); - continue; + }; + MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> + PossibleReducedVals; + initReductionOps(Inst); + while (!Worklist.empty()) { + Instruction *TreeN = Worklist.front(); + Worklist.pop(); + SmallVector<Value *> Args; + SmallVector<Value *> PossibleRedVals; + SmallVector<Instruction *> PossibleReductionOps; + CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps); + // If too many extra args - mark the instruction itself as a reduction + // value, not a reduction operation. + if (Args.size() < 2) { + addReductionOps(TreeN); + // Add extra args. + for (Value *V : Args) + ExtraArgs[TreeN] = V; + // Add reduction values. The values are sorted for better vectorization + // results. + for (Value *V : PossibleRedVals) { + unsigned Key = V->getValueID() + 1; + // Sort the loads by the distance between the pointers. + if (auto *LI = dyn_cast<LoadInst>(V)) { + bool Found = false; + for (const auto &LoadData : PossibleReducedVals[Key]) { + auto *RLI = cast<LoadInst>(LoadData.second.front()); + if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), + LI->getType(), LI->getPointerOperand(), DL, + SE, /*StrictCheck=*/true)) { + PossibleReducedVals[Key][reinterpret_cast<size_t>( + RLI->getPointerOperand())] + .push_back(V); + Found = true; + break; + } + } + if (!Found) + PossibleReducedVals[Key][reinterpret_cast<size_t>( + LI->getPointerOperand())] + .push_back(V); + } else if (auto *EI = dyn_cast<ExtractElementInst>(V)) { + // Sort extracts by the vector operands. + PossibleReducedVals[Key][reinterpret_cast<size_t>( + EI->getVectorOperand())] + .push_back(V); + } else if (auto *I = dyn_cast<Instruction>(V)) { + // Sort other instructions just by the opcodes except for CMPInst. + // For CMP also sort by the predicate kind.
+ if (isValidForAlternation(I->getOpcode()) && !isa(I)) + PossibleReducedVals[0][I->getOpcode()].push_back(V); + else if (auto *CI = dyn_cast(I)) + PossibleReducedVals[Key] + [hash_combine(hash_value(I->getOpcode()), + hash_value(CI->getPredicate()))] + .push_back(V); + else + PossibleReducedVals[Key][I->getOpcode()].push_back(V); + } else { + PossibleReducedVals[Key][0].push_back(V); } - } else if (!LeafOpcode) { - LeafOpcode = EdgeInst->getOpcode(); } - Stack.push_back( - std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst))); - continue; + for (Instruction *I : PossibleReductionOps) + Worklist.push(I); + } else { + PossibleReducedVals[0][TreeN->getOpcode()].push_back(TreeN); } - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), EdgeInst); + } + auto PossibleReducedValsVect = PossibleReducedVals.takeVector(); + // Sort the reduced values by number of same/alternate opcode and/or pointer + // operand. + auto Cmp = [](ArrayRef P1, ArrayRef P2) { + return P1.size() < P2.size(); + }; + std::priority_queue, SmallVector>, + decltype(Cmp)> + OrderedVals(Cmp); + // Sort values by total number of values kinds. + for (auto &PossibleReducedVals : PossibleReducedValsVect) { + auto PossibleRedVals = PossibleReducedVals.second.takeVector(); + stable_sort(PossibleRedVals, [](const auto &P1, const auto &P2) { + return P1.second.size() > P2.second.size(); + }); + for (auto &Data : PossibleRedVals) + OrderedVals.emplace(Data.second); + } + while (!OrderedVals.empty()) { + ReducedVals.emplace_back(OrderedVals.top().rbegin(), + OrderedVals.top().rend()); + OrderedVals.pop(); } return true; } @@ -8487,34 +8536,29 @@ // If there are a sufficient number of reduction values, reduce // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. - unsigned NumReducedVals = ReducedVals.size(); + unsigned NumReducedVals = std::accumulate( + ReducedVals.begin(), ReducedVals.end(), 0, + [](int Num, ArrayRef Vals) { return Num += Vals.size(); }); if (NumReducedVals < 4) return nullptr; - // Intersect the fast-math-flags from all reduction operations. - FastMathFlags RdxFMF; - RdxFMF.set(); - for (ReductionOpsType &RdxOp : ReductionOps) { - for (Value *RdxVal : RdxOp) { - if (auto *FPMO = dyn_cast(RdxVal)) - RdxFMF &= FPMO->getFastMathFlags(); - } - } - IRBuilder<> Builder(cast(ReductionRoot)); - Builder.setFastMathFlags(RdxFMF); + // Track the reduced values in case if they are replaced by extractelement + // because of the vectorization. + DenseMap TrackedVals; BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; // The same extra argument may be used several times, so log each attempt // to use it. for (const std::pair &Pair : ExtraArgs) { assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); + TrackedVals.try_emplace(Pair.second, Pair.second); } // The compare instruction of a min/max is the insertion point for new // instructions and may be replaced with a new compare instruction. - auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) { assert(isa(RdxRootInst) && "Expected min/max reduction to have select root instruction"); Value *ScalarCond = cast(RdxRootInst)->getCondition(); @@ -8526,143 +8570,280 @@ // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. 
ExternallyUsedValues[ReductionRoot]; - SmallVector IgnoreList; - for (ReductionOpsType &RdxOp : ReductionOps) - IgnoreList.append(RdxOp.begin(), RdxOp.end()); - - unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - if (NumReducedVals > ReduxWidth) { - // In the loop below, we are building a tree based on a window of - // 'ReduxWidth' values. - // If the operands of those values have common traits (compare predicate, - // constant operand, etc), then we want to group those together to - // minimize the cost of the reduction. - - // TODO: This should be extended to count common operands for - // compares and binops. - - // Step 1: Count the number of times each compare predicate occurs. - SmallDenseMap PredCountMap; - for (Value *RdxVal : ReducedVals) { - CmpInst::Predicate Pred; - if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) - ++PredCountMap[Pred]; - } - // Step 2: Sort the values so the most common predicates come first. - stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { - CmpInst::Predicate PredA, PredB; - if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && - match(B, m_Cmp(PredB, m_Value(), m_Value()))) { - return PredCountMap[PredA] > PredCountMap[PredB]; - } - return false; - }); - } + SmallVector IgnoreList; + for (ReductionOpsType &RdxOps : ReductionOps) + for (Value *RdxOp : RdxOps) { + if (!RdxOp) + continue; + IgnoreList.push_back(RdxOp); + } + // Need to track reduced vals, they may be changed during vectorization of + // subvectors. + for (ArrayRef Candidates : ReducedVals) + for (Value *V : Candidates) + TrackedVals.try_emplace(V, V); + + DenseMap VectorizedVals; Value *VectorizedTree = nullptr; - unsigned i = 0; - while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { - ArrayRef VL(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, IgnoreList); - if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) - break; - if (V.isLoadCombineReductionCandidate(RdxKind)) - break; - V.reorderTopToBottom(); - V.reorderBottomToTop(/*IgnoreReorder=*/true); - V.buildExternalUses(ExternallyUsedValues); - - // For a poison-safe boolean logic reduction, do not replace select - // instructions with logic ops. All reduced values will be frozen (see - // below) to prevent leaking poison. - if (isa(ReductionRoot) && - isBoolLogicOp(cast(ReductionRoot)) && - NumReducedVals != ReduxWidth) - break; + // Try to vectorize elements base on their type. + for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { + ArrayRef OrigReducedVals = ReducedVals[I]; + InstructionsState S = getSameOpcode(OrigReducedVals); + SmallVector Candidates; + DenseMap TrackedToOrig; + for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) { + Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second; + // Check if the reduction value was not overriden by the extractelement + // instruction because of the vectorization and exclude it, if it is not + // compatible with other values. + if (auto *Inst = dyn_cast(RdxVal)) + if (!S.getOpcode() || !S.isOpcodeOrAlt(Inst)) + continue; + Candidates.push_back(RdxVal); + TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); + } + bool ShuffledExtracts = false; + // Try to handle shuffled extractelements. 
+ if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() && + I + 1 < E) { + InstructionsState S = getSameOpcode(ReducedVals[I + 1]); + if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle()) { + SmallVector CommonCandidates(Candidates); + for (unsigned Cnt = 0, Sz = ReducedVals[I + 1].size(); Cnt < Sz; + ++Cnt) { + Value *RdxVal = TrackedVals.find(ReducedVals[I + 1][Cnt])->second; + // Check if the reduction value was not overriden by the + // extractelement instruction because of the vectorization and + // exclude it, if it is not compatible with other values. + if (auto *Inst = dyn_cast(RdxVal)) + if (!S.getOpcode() || !S.isOpcodeOrAlt(Inst)) + continue; + CommonCandidates.push_back(RdxVal); + TrackedToOrig.try_emplace(RdxVal, ReducedVals[I + 1][Cnt]); + } + SmallVector Mask; + if (isFixedVectorShuffle(CommonCandidates, Mask)) { + ++I; + Candidates.swap(CommonCandidates); + ShuffledExtracts = true; + } + } + } + unsigned NumReducedVals = Candidates.size(); + if (NumReducedVals < 4) + continue; - V.computeMinimumValueSizes(); + unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); + unsigned Start = 0; + unsigned i = Start; + // Restarts vectorization attempt with lower vector factor. + auto &&AdjustReducedVals = [&i, &Start, &ReduxWidth, NumReducedVals]() { + if (ReduxWidth == 4 || i >= NumReducedVals - ReduxWidth + 1) { + ++Start; + ReduxWidth = PowerOf2Floor(NumReducedVals - Start) * 2; + } + i = Start; + ReduxWidth /= 2; + }; + while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth >= 4) { + ArrayRef VL(std::next(Candidates.begin(), i), ReduxWidth); + V.buildTree(VL, IgnoreList); + if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) { + AdjustReducedVals(); + continue; + } + if (V.isLoadCombineReductionCandidate(RdxKind)) { + AdjustReducedVals(); + continue; + } + V.reorderTopToBottom(); + // No need to reorder the root node at all. + V.reorderBottomToTop(/*IgnoreReorder=*/true); + // Keep extracted other reduction values, if they are used in the + // vectorization trees. + BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( + ExternallyUsedValues); + for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { + if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) + continue; + for_each(ReducedVals[Cnt], + [&LocalExternallyUsedValues, &TrackedVals](Value *V) { + if (isa(V)) + LocalExternallyUsedValues[TrackedVals[V]]; + }); + } + for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { + if (Cnt >= i && Cnt < i + ReduxWidth) + continue; + if (VectorizedVals.count(Candidates[Cnt])) + continue; + LocalExternallyUsedValues[Candidates[Cnt]]; + } + V.buildExternalUses(LocalExternallyUsedValues); - // Estimate cost. - InstructionCost TreeCost = - V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth)); - InstructionCost ReductionCost = - getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF); - InstructionCost Cost = TreeCost + ReductionCost; - if (!Cost.isValid()) { - LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); - return nullptr; - } - if (Cost >= -SLPCostThreshold) { + V.computeMinimumValueSizes(); + + // Intersect the fast-math-flags from all reduction operations. + FastMathFlags RdxFMF; + RdxFMF.set(); + for (Value *RdxVal : VL) { + if (auto *FPMO = dyn_cast( + ReducedValsToOps.find(RdxVal)->second)) + RdxFMF &= FPMO->getFastMathFlags(); + } + // Estimate cost. 
+ InstructionCost TreeCost = V.getTreeCost(VL); + InstructionCost ReductionCost = + getReductionCost(TTI, VL, ReduxWidth, RdxFMF); + InstructionCost Cost = TreeCost + ReductionCost; + if (!Cost.isValid()) { + LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); + return nullptr; + } + if (Cost >= -SLPCostThreshold) { + V.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", + cast(VL[0])) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " << ore::NV("Cost", Cost) + << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); + AdjustReducedVals(); + continue; + } + + LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" + << Cost << ". (HorRdx)\n"); V.getORE()->emit([&]() { - return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", - cast(VL[0])) - << "Vectorizing horizontal reduction is possible" - << "but not beneficial with cost " << ore::NV("Cost", Cost) - << " and threshold " - << ore::NV("Threshold", -SLPCostThreshold); + return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", + cast(VL[0])) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize()); }); - break; - } - LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" - << Cost << ". (HorRdx)\n"); - V.getORE()->emit([&]() { - return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", - cast(VL[0])) - << "Vectorized horizontal reduction with cost " - << ore::NV("Cost", Cost) << " and with tree size " - << ore::NV("TreeSize", V.getTreeSize()); - }); + Builder.setFastMathFlags(RdxFMF); - // Vectorize a tree. - DebugLoc Loc = cast(ReducedVals[i])->getDebugLoc(); - Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); + // Vectorize a tree. + Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues); - // Emit a reduction. If the root is a select (min/max idiom), the insert - // point is the compare condition of that select. - Instruction *RdxRootInst = cast(ReductionRoot); - if (isCmpSelMinMax(RdxRootInst)) - Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); - else - Builder.SetInsertPoint(RdxRootInst); + // Emit a reduction. If the root is a select (min/max idiom), the insert + // point is the compare condition of that select. + Instruction *RdxRootInst = cast(ReductionRoot); + if (isCmpSelMinMax(RdxRootInst)) + Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); - // To prevent poison from leaking across what used to be sequential, safe, - // scalar boolean logic operations, the reduction operand must be frozen. - if (isa(RdxRootInst) && isBoolLogicOp(RdxRootInst)) - VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); + // To prevent poison from leaking across what used to be sequential, + // safe, scalar boolean logic operations, the reduction operand must be + // frozen. + if (isa(RdxRootInst) && isBoolLogicOp(RdxRootInst)) + VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); - Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); + Value *ReducedSubTree = + emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); - if (!VectorizedTree) { - // Initialize the final value in the reduction. - VectorizedTree = ReducedSubTree; - } else { - // Update the final value in the reduction. 
- Builder.SetCurrentDebugLocation(Loc); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - ReducedSubTree, "op.rdx", ReductionOps); + if (!VectorizedTree) { + // Initialize the final value in the reduction. + VectorizedTree = ReducedSubTree; + } else { + // Update the final value in the reduction. + Builder.SetCurrentDebugLocation( + cast(ReductionOps.front().front())->getDebugLoc()); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + ReducedSubTree, "op.rdx", ReductionOps); + } + // Count vectorized reduced values to exclude them from final reduction. + for (Value *V : VL) + ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0) + .first->getSecond(); + i += ReduxWidth; + Start = i; + ReduxWidth = PowerOf2Floor(NumReducedVals - i); } - i += ReduxWidth; - ReduxWidth = PowerOf2Floor(NumReducedVals - i); } - if (VectorizedTree) { - // Finish the reduction. - for (; i < NumReducedVals; ++i) { - auto *I = cast(ReducedVals[i]); - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = - createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); - } - for (auto &Pair : ExternallyUsedValues) { - // Add each externally used value to the final reduction. - for (auto *I : Pair.second) { - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - Pair.first, "op.extra", I); + // Need to add extra arguments and not vectorized possible reduction + // values. + // Try to avoid dependencies between the scalar reductions. + auto &&FinalGen = + [this, &Builder, + &TrackedVals](ArrayRef> InstVals) { + unsigned Sz = InstVals.size(); + SmallVector> ExtraReds(Sz / 2 + + Sz % 2); + for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) { + Instruction *RedOp = InstVals[I + 1].first; + Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); + ReductionOpsListType Ops; + if (auto *Sel = dyn_cast(RedOp)) + Ops.emplace_back().push_back(Sel->getCondition()); + Ops.emplace_back().push_back(RedOp); + Value *RdxVal1 = InstVals[I].second; + Value *StableRdxVal1 = RdxVal1; + auto It1 = TrackedVals.find(RdxVal1); + if (It1 != TrackedVals.end()) + StableRdxVal1 = It1->second; + Value *RdxVal2 = InstVals[I + 1].second; + Value *StableRdxVal2 = RdxVal2; + auto It2 = TrackedVals.find(RdxVal2); + if (It2 != TrackedVals.end()) + StableRdxVal2 = It2->second; + Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, + StableRdxVal2, "op.rdx", Ops); + ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); + } + if (Sz % 2 == 1) + ExtraReds[Sz / 2] = InstVals.back(); + return ExtraReds; + }; + SmallVector> ExtraReductions; + // Final reduction of not vectorized reduced values. + for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { + ArrayRef Candidates = ReducedVals[I]; + for (unsigned Cnt = 0, NumReducedVals = Candidates.size(); + Cnt < NumReducedVals; ++Cnt) { + Value *RdxVal = Candidates[Cnt]; + auto It = VectorizedVals.find(RdxVal); + if (It != VectorizedVals.end()) { + --It->getSecond(); + if (It->second == 0) + VectorizedVals.erase(It); + continue; + } + Instruction *RedOp = ReducedValsToOps.find(RdxVal)->second; + ExtraReductions.emplace_back(RedOp, RdxVal); } } + for (const auto &Pair : ExtraArgs) { + // Add each externally used value to the final reduction. + ExtraReductions.emplace_back(Pair.first, Pair.second); + } + // Iterate through all not-vectorized reduction values/extra arguments. 
+ while (ExtraReductions.size() > 1) { + SmallVector> NewReds = + FinalGen(ExtraReductions); + ExtraReductions.swap(NewReds); + } + // Final reduction. + if (ExtraReductions.size() == 1) { + Instruction *RedOp = ExtraReductions.back().first; + Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); + ReductionOpsListType Ops; + if (auto *Sel = dyn_cast(RedOp)) + Ops.emplace_back().push_back(Sel->getCondition()); + Ops.emplace_back().push_back(RedOp); + Value *RdxVal = ExtraReductions.back().second; + Value *StableRdxVal = RdxVal; + auto It = TrackedVals.find(RdxVal); + if (It != TrackedVals.end()) + StableRdxVal = It->second; + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + StableRdxVal, "op.rdx", Ops); + } ReductionRoot->replaceAllUsesWith(VectorizedTree); @@ -8678,12 +8859,16 @@ private: /// Calculate the cost of a reduction. InstructionCost getReductionCost(TargetTransformInfo *TTI, - Value *FirstReducedVal, unsigned ReduxWidth, - FastMathFlags FMF) { + ArrayRef ReducedVals, + unsigned ReduxWidth, FastMathFlags FMF) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + Value *FirstReducedVal = ReducedVals.front(); Type *ScalarTy = FirstReducedVal->getType(); FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); - InstructionCost VectorCost, ScalarCost; + InstructionCost VectorCost = 0, ScalarCost; + // If all of the reduced values are constant, the vector cost is 0, since + // the reduction value can be calculated at the compile time. + bool AllConsts = all_of(ReducedVals, isConstant); switch (RdxKind) { case RecurKind::Add: case RecurKind::Mul: @@ -8693,17 +8878,21 @@ case RecurKind::FAdd: case RecurKind::FMul: { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); - VectorCost = - TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind); + if (!AllConsts) + VectorCost = + TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind); ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind); break; } case RecurKind::FMax: case RecurKind::FMin: { auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); - auto *VecCondTy = cast(CmpInst::makeCmpResultType(VectorTy)); - VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*unsigned=*/false, CostKind); + if (!AllConsts) { + auto *VecCondTy = + cast(CmpInst::makeCmpResultType(VectorTy)); + VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + /*unsigned=*/false, CostKind); + } CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy, RdxPred, CostKind) + @@ -8716,11 +8905,14 @@ case RecurKind::UMax: case RecurKind::UMin: { auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); - auto *VecCondTy = cast(CmpInst::makeCmpResultType(VectorTy)); - bool IsUnsigned = - RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; - VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned, - CostKind); + if (!AllConsts) { + auto *VecCondTy = + cast(CmpInst::makeCmpResultType(VectorTy)); + bool IsUnsigned = + RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; + VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + IsUnsigned, CostKind); + } CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, SclCondTy, RdxPred, CostKind) + @@ -8935,7 +9127,7 @@ /// performed. 
static bool tryToVectorizeHorReductionOrInstOperands( PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI, + TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL, const function_ref Vectorize) { if (!ShouldVectorizeHor) return false; @@ -8962,13 +9154,13 @@ SmallPtrSet VisitedInstrs; SmallVector PostponedInsts; bool Res = false; - auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0, - Value *&B1) -> Value * { + auto &&TryToReduce = [TTI, &SE, &DL, &P, &R](Instruction *Inst, Value *&B0, + Value *&B1) -> Value * { bool IsBinop = matchRdxBop(Inst, B0, B1); bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); if (IsBinop || IsSelect) { HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, Inst)) + if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL)) return HorRdx.tryToReduce(R, TTI); } return nullptr; @@ -9051,7 +9243,7 @@ auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { return tryToVectorize(I, R); }; - return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, + return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL, ExtraVectorization); } diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -90,24 +90,16 @@ define float @test_merge_anyof_v4sf(<4 x float> %t) { ; CHECK-LABEL: @test_merge_anyof_v4sf( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[T:%.*]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[T]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[T]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[T]], i32 0 -; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0 -; CHECK-NEXT: [[CMP19:%.*]] = fcmp ogt float [[TMP3]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND3:%.*]] = select i1 [[TMP6]], i1 true, i1 [[CMP19]] -; CHECK-NEXT: [[CMP24:%.*]] = fcmp ogt float [[TMP2]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND4:%.*]] = select i1 [[OR_COND3]], i1 true, i1 [[CMP24]] -; CHECK-NEXT: [[CMP29:%.*]] = fcmp ogt float [[TMP1]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND5:%.*]] = select i1 [[OR_COND4]], i1 true, i1 [[CMP29]] -; CHECK-NEXT: [[CMP34:%.*]] = fcmp ogt float [[TMP0]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[OR_COND5]], i1 true, i1 [[CMP34]] -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[ADD]] +; CHECK-NEXT: [[T_FR7:%.*]] = freeze <4 x float> [[T:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR7]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[T_FR7]], +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP3]], 0 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR7]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[SHIFT]], [[T_FR7]] +; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; CHECK-NEXT: 
[[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[ADD]], float 0.000000e+00 ; CHECK-NEXT: ret float [[RETVAL_0]] ; entry: @@ -420,25 +412,17 @@ define float @test_merge_anyof_v4si(<4 x i32> %t) { ; CHECK-LABEL: @test_merge_anyof_v4si( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[T:%.*]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[T]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[T]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[T]], i32 0 -; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x i32> [[T]] -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[T_FR]], -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0 -; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP3]], 255 -; CHECK-NEXT: [[OR_COND3:%.*]] = select i1 [[TMP6]], i1 true, i1 [[CMP11]] -; CHECK-NEXT: [[CMP14:%.*]] = icmp sgt i32 [[TMP2]], 255 -; CHECK-NEXT: [[OR_COND4:%.*]] = select i1 [[OR_COND3]], i1 true, i1 [[CMP14]] -; CHECK-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[TMP1]], 255 -; CHECK-NEXT: [[OR_COND5:%.*]] = select i1 [[OR_COND4]], i1 true, i1 [[CMP17]] -; CHECK-NEXT: [[CMP20:%.*]] = icmp sgt i32 [[TMP0]], 255 -; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[OR_COND5]], i1 true, i1 [[CMP20]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[T_FR7:%.*]] = freeze <4 x i32> [[T:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[T_FR7]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[T_FR7]], +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP3]], 0 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR7]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T_FR7]] +; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float -; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[CONV]] +; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[CONV]], float 0.000000e+00 ; CHECK-NEXT: ret float [[RETVAL_0]] ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -41,7 +41,7 @@ define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]], <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP2]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -15,10 +15,10 @@ ; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; DEFAULT-NEXT: br label [[FOR_BODY:%.*]] ; DEFAULT: for.body: -; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ 
[[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> ; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) -; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]] +; DEFAULT-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[P17]] ; DEFAULT-NEXT: br label [[FOR_BODY]] ; ; GATHER-LABEL: @PR28330( @@ -27,49 +27,27 @@ ; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: -; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; GATHER-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> ; GATHER-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) -; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]] +; GATHER-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[P17]] ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR28330( ; MAX-COST-NEXT: entry: -; MAX-COST-NEXT: [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 -; MAX-COST-NEXT: [[P1:%.*]] = icmp eq i8 [[P0]], 0 -; MAX-COST-NEXT: [[P2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 -; MAX-COST-NEXT: [[P3:%.*]] = icmp eq i8 [[P2]], 0 -; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 -; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 -; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 -; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 -; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 -; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0 -; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 -; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 -; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 -; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0 +; MAX-COST-NEXT: [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <4 x i8>*), align 1 +; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer +; MAX-COST-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5) to <4 x i8>*), align 1 +; MAX-COST-NEXT: [[TMP3:%.*]] = icmp eq <4 x i8> [[TMP2]], zeroinitializer ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: -; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[P19:%.*]] = select i1 [[P1]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P20:%.*]] = add i32 [[P17]], [[P19]] -; MAX-COST-NEXT: [[P21:%.*]] = select i1 [[P3]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P22:%.*]] = add i32 [[P20]], [[P21]] -; MAX-COST-NEXT: [[P23:%.*]] = select i1 [[P5]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P24:%.*]] = add i32 [[P22]], [[P23]] -; MAX-COST-NEXT: [[P25:%.*]] = select i1 [[P7]], i32 -720, i32 -80 -; 
MAX-COST-NEXT: [[P26:%.*]] = add i32 [[P24]], [[P25]] -; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P28:%.*]] = add i32 [[P26]], [[P27]] -; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P30:%.*]] = add i32 [[P28]], [[P29]] -; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[P30]], [[P31]] -; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P34]] = add i32 [[P32]], [[P33]] +; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; MAX-COST-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; MAX-COST-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) +; MAX-COST-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP6]], [[TMP7]] +; MAX-COST-NEXT: [[OP_RDX1]] = add i32 [[OP_RDX]], [[P17]] ; MAX-COST-NEXT: br label [[FOR_BODY]] ; entry: @@ -119,10 +97,10 @@ ; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; DEFAULT-NEXT: br label [[FOR_BODY:%.*]] ; DEFAULT: for.body: -; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> ; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) -; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5 +; DEFAULT-NEXT: [[OP_RDX]] = add i32 [[TMP3]], -5 ; DEFAULT-NEXT: br label [[FOR_BODY]] ; ; GATHER-LABEL: @PR32038( @@ -131,38 +109,27 @@ ; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: -; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; GATHER-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> ; GATHER-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) -; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5 +; GATHER-NEXT: [[OP_RDX]] = add i32 [[TMP3]], -5 ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: ; MAX-COST-NEXT: [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <4 x i8>*), align 1 ; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer -; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 -; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 -; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 -; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0 -; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 -; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 -; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 -; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* 
getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5) to <4 x i8>*), align 1 +; MAX-COST-NEXT: [[TMP3:%.*]] = icmp eq <4 x i8> [[TMP2]], zeroinitializer ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: -; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> , <4 x i32> -; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) -; MAX-COST-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[P27]] -; MAX-COST-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP5]], -5 -; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] -; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P34]] = add i32 [[P32]], [[P33]] +; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; MAX-COST-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; MAX-COST-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) +; MAX-COST-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP6]], [[TMP7]] +; MAX-COST-NEXT: [[OP_RDX1]] = add i32 [[OP_RDX]], -5 ; MAX-COST-NEXT: br label [[FOR_BODY]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -221,12 +221,12 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @reduction_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], ; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -221,12 +221,12 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @reduction_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]] -; 
CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], ; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll @@ -22,11 +22,11 @@ ; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]] ; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; GFX9-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) -; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP5]], i32 [[SELECT1]] +; GFX9-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]] +; GFX9-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP5]], i32 [[SELECT1]] ; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4 ; GFX9-NEXT: store i32 [[STORE_SELECT]], i32* @var, align 8 -; GFX9-NEXT: ret i32 [[OP_EXTRA1]] +; GFX9-NEXT: ret i32 [[OP_RDX1]] ; %load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -63,11 +63,11 @@ ; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]] ; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16 ; GFX9-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP4]]) -; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i64 [[TMP5]], i64 [[SELECT1]] +; GFX9-NEXT: [[OP_RDX:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]] +; GFX9-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i64 [[TMP5]], i64 [[SELECT1]] ; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4 ; GFX9-NEXT: store i64 [[STORE_SELECT]], i64* @var64, align 8 -; GFX9-NEXT: ret i64 [[OP_EXTRA1]] +; GFX9-NEXT: ret i64 [[OP_RDX1]] ; %load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16 %load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8 @@ -206,11 +206,11 @@ ; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]] ; GFX9-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 
x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; GFX9-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]]) -; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP3]], i32 [[SELECT1]] +; GFX9-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]] +; GFX9-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP3]], i32 [[SELECT1]] ; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4 ; GFX9-NEXT: store i32 [[STOREVAL]], i32* @var, align 8 -; GFX9-NEXT: ret i32 [[OP_EXTRA1]] +; GFX9-NEXT: ret i32 [[OP_RDX1]] ; %vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 %elt1 = extractelement <2 x i32> %vload, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[PTR:%.*]], null ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA3:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX3:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 3 @@ -20,10 +20,10 @@ ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[TMP4]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP10]], 1 -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], [[TMP7]] -; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], [[TMP6]] -; CHECK-NEXT: [[OP_EXTRA3]] = add i32 [[OP_EXTRA2]], [[TMP5]] +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]] +; CHECK-NEXT: [[OP_RDX3]] = add i32 [[TMP10]], [[OP_RDX2]] ; CHECK-NEXT: br label [[LOOP]] ; CHECK: bail_out: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll @@ -7,20 +7,20 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX1:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3]] = extractelement <4 x i64> [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 ; 
CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP4]], 32 ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> , [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i64 [[TMP7]], 0 -; CHECK-NEXT: [[OP_EXTRA1]] = add i64 [[OP_EXTRA]], [[TMP3]] +; CHECK-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP3]], 0 +; CHECK-NEXT: [[OP_RDX1]] = add i64 [[TMP7]], [[OP_RDX]] ; CHECK-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -5,98 +5,103 @@ define void @Test(i32) { ; CHECK-LABEL: @Test( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP0]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP0]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP0]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP0]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP0]], i32 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP0]], i32 5 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP0]], i32 6 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP0]], i32 7 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP0]], i32 8 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP0]], i32 9 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP0]], i32 10 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP0]], i32 11 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP0]], i32 12 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP0]], i32 13 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP0]], i32 14 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP0]], i32 15 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0:%.*]] -; CHECK-NEXT: 
[[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA2]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> , i32 [[OP_EXTRA26]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = and <2 x i32> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP10]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x i32> [ [[TMP45:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP26]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP28:%.*]] = add <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]]) +; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP28]]) +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP30]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = and <2 x i32> [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP27]] +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <2 x i32> poison, i32 [[TMP31]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[OP_RDX3]], i32 1 +; CHECK-NEXT: [[TMP37:%.*]] = and <2 x i32> [[TMP34]], [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i32> [[TMP37]], i32 0 +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP37]], i32 1 +; CHECK-NEXT: [[TMP40:%.*]] = 
insertelement <2 x i32> poison, i32 [[TMP38]], i32 0 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> [[TMP40]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <2 x i32> , i32 [[TMP39]], i32 0 +; CHECK-NEXT: [[TMP43:%.*]] = and <2 x i32> [[TMP41]], [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = add <2 x i32> [[TMP41]], [[TMP42]] +; CHECK-NEXT: [[TMP45]] = shufflevector <2 x i32> [[TMP43]], <2 x i32> [[TMP44]], <2 x i32> ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP0]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP0]], i32 2 +; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP0]], i32 3 +; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP0]], i32 4 +; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP0]], i32 5 +; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP0]], i32 6 +; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP0]], i32 7 +; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[TMP0]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 2 +; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP0]], i32 3 +; FORCE_REDUCTION-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP0]], i32 4 +; FORCE_REDUCTION-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP0]], i32 5 +; FORCE_REDUCTION-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP0]], i32 6 +; FORCE_REDUCTION-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP0]], i32 7 +; FORCE_REDUCTION-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP0]], i32 8 +; FORCE_REDUCTION-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP0]], i32 9 +; FORCE_REDUCTION-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP0]], i32 10 +; FORCE_REDUCTION-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP0]], i32 11 +; FORCE_REDUCTION-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP0]], i32 12 +; FORCE_REDUCTION-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP0]], i32 13 +; FORCE_REDUCTION-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP0]], i32 14 +; FORCE_REDUCTION-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP0]], i32 15 ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP12:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], -; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496 -; FORCE_REDUCTION-NEXT: [[VAL_34:%.*]] = add i32 [[TMP2]], 8555 -; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP3]]) -; FORCE_REDUCTION-NEXT: 
[[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]] -; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0:%.*]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA2]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP2]] -; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 -; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA27]], [[VAL_39]] -; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_40]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[VAL_41]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = and <2 x i32> [[TMP8]], [[TMP9]] -; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP8]], [[TMP9]] -; FORCE_REDUCTION-NEXT: [[TMP12]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> +; FORCE_REDUCTION-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP36:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <8 x i32> +; FORCE_REDUCTION-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP27:%.*]] = add <8 x i32> [[SHUFFLE]], +; FORCE_REDUCTION-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]]) +; FORCE_REDUCTION-NEXT: 
[[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]]) +; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP28]], [[TMP29]] +; FORCE_REDUCTION-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP27]]) +; FORCE_REDUCTION-NEXT: [[OP_RDX13:%.*]] = and i32 [[TMP0]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_RDX14:%.*]] = and i32 [[TMP0]], [[TMP26]] +; FORCE_REDUCTION-NEXT: [[OP_RDX15:%.*]] = and i32 [[OP_RDX13]], [[OP_RDX14]] +; FORCE_REDUCTION-NEXT: [[OP_RDX16:%.*]] = and i32 [[TMP30]], [[OP_RDX15]] +; FORCE_REDUCTION-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> [[TMP31]], i32 [[OP_RDX]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX16]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP34:%.*]] = add <2 x i32> [[TMP32]], [[TMP33]] +; FORCE_REDUCTION-NEXT: [[TMP35:%.*]] = and <2 x i32> [[TMP32]], [[TMP33]] +; FORCE_REDUCTION-NEXT: [[TMP36]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> [[TMP35]], <2 x i32> ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -4,20 +4,20 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) { ; CHECK-LABEL: @mainTest( ; CHECK-NEXT: bci_15.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 0 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 ; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[V44]], i32 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0 +; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[V44]], i32 1 ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll @@ -60,7 +60,7 @@ define i8 
@i(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: @i( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> [[X:%.*]], <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP2]]) ; CHECK-NEXT: ret i8 [[TMP3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll @@ -60,7 +60,7 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: @i( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> [[X:%.*]], <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP2]]) ; CHECK-NEXT: ret i8 [[TMP3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll @@ -7,22 +7,20 @@ ; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef ; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]] ; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD1:%.*]] = add i32 undef, [[ADD0]] ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef ; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD1]], [[ADD2]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef ; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD5:%.*]] = add i32 [[ADD3]], [[ADD4]] -; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD5]], undef -; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD6]], undef -; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[ADD7]], undef ; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]] ; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[ADD8]], [[ADD9]] -; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD10]], undef -; CHECK-NEXT: ret i32 [[ADD11]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[ADD0]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]] +; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[TMP0]], [[OP_RDX3]] +; CHECK-NEXT: ret i32 [[OP_RDX4]] ; entry: %or0 = or i64 undef, undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -13,58 +13,30 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = 
load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x float> poison, float [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP10]], i32 4 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP9]], i32 5 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP5]], i32 6 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP4]], i32 7 -; CHECK-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP18]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP19]], [[CONV]] -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] -; CHECK-NEXT: store float [[OP_EXTRA1]], float* @res, align 4 -; CHECK-NEXT: ret float [[OP_EXTRA1]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP4]], [[OP_RDX]] +; CHECK-NEXT: store float [[OP_RDX1]], float* @res, align 4 +; CHECK-NEXT: ret float [[OP_RDX1]] ; ; THRESHOLD-LABEL: @baz( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* 
@arr, i64 0, i64 2) to <2 x float>*), align 8 -; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 -; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; THRESHOLD-NEXT: [[TMP11:%.*]] = insertelement <8 x float> poison, float [[TMP10]], i32 0 -; THRESHOLD-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP9]], i32 1 -; THRESHOLD-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP5]], i32 2 -; THRESHOLD-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP4]], i32 3 -; THRESHOLD-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP10]], i32 4 -; THRESHOLD-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP9]], i32 5 -; THRESHOLD-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP5]], i32 6 -; THRESHOLD-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP4]], i32 7 -; THRESHOLD-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP18]]) -; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP19]], [[CONV]] -; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] -; THRESHOLD-NEXT: store float [[OP_EXTRA1]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[OP_EXTRA1]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP4]], [[OP_RDX]] +; THRESHOLD-NEXT: store float [[OP_RDX1]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; entry: %0 = load i32, i32* @n, align 4 @@ -107,26 +79,30 @@ ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] -; CHECK-NEXT: store float [[OP_EXTRA1]], float* @res, align 4 -; CHECK-NEXT: ret float [[OP_EXTRA1]] +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV6]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP4]], [[OP_RDX]] +; CHECK-NEXT: store float [[OP_RDX1]], float* @res, align 4 +; CHECK-NEXT: ret float [[OP_RDX1]] ; ; THRESHOLD-LABEL: @bazz( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 -; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 -; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = 
load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 -; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] -; THRESHOLD-NEXT: store float [[OP_EXTRA1]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[OP_EXTRA1]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP0]], i32 1 +; THRESHOLD-NEXT: [[TMP6:%.*]] = mul nsw <2 x i32> [[TMP5]], +; THRESHOLD-NEXT: [[TMP7:%.*]] = shl nsw <2 x i32> [[TMP5]], +; THRESHOLD-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> +; THRESHOLD-NEXT: [[TMP9:%.*]] = sitofp <2 x i32> [[TMP8]] to <2 x float> +; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 +; THRESHOLD-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP11]], [[TMP12]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP10]], [[OP_RDX]] +; THRESHOLD-NEXT: store float [[OP_RDX1]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; entry: %0 = load i32, i32* @n, align 4 @@ -358,8 +334,6 @@ ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 ; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 ; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 @@ -376,6 +350,8 @@ ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 ; CHECK-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; CHECK-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32 ; CHECK-NEXT: [[ARRAYIDX_33:%.*]] = getelementptr inbounds float, float* [[X]], i64 33 ; CHECK-NEXT: [[ARRAYIDX_34:%.*]] = getelementptr inbounds float, float* [[X]], i64 34 @@ -392,10 +368,10 @@ ; CHECK-NEXT: [[ARRAYIDX_45:%.*]] = getelementptr inbounds float, float* [[X]], i64 45 ; CHECK-NEXT: [[ARRAYIDX_46:%.*]] = getelementptr inbounds float, float* [[X]], i64 46 ; CHECK-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4 -; CHECK-NEXT: 
[[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_32]] to <16 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x float>, <16 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP3]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret float [[OP_RDX]] ; @@ -416,8 +392,6 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 ; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 ; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 -; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>* -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 ; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 @@ -434,6 +408,8 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 ; THRESHOLD-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; THRESHOLD-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 +; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32 ; THRESHOLD-NEXT: [[ARRAYIDX_33:%.*]] = getelementptr inbounds float, float* [[X]], i64 33 ; THRESHOLD-NEXT: [[ARRAYIDX_34:%.*]] = getelementptr inbounds float, float* [[X]], i64 34 @@ -450,10 +426,10 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_45:%.*]] = getelementptr inbounds float, float* [[X]], i64 45 ; THRESHOLD-NEXT: [[ARRAYIDX_46:%.*]] = getelementptr inbounds float, float* [[X]], i64 46 ; THRESHOLD-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47 -; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>* -; THRESHOLD-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4 -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_32]] to <16 x float>* +; THRESHOLD-NEXT: [[TMP3:%.*]] = load <16 x float>, <16 x float>* [[TMP2]], align 4 +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; THRESHOLD-NEXT: ret float [[OP_RDX]] ; @@ -642,8 +618,8 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x 
float>, <32 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] -; CHECK-NEXT: ret float [[OP_EXTRA]] +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]] +; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @f1( ; THRESHOLD-NEXT: entry: @@ -683,8 +659,8 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] -; THRESHOLD-NEXT: ret float [[OP_EXTRA]] +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]] +; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: %rem = srem i32 %a, %b @@ -791,15 +767,11 @@ ; CHECK-LABEL: @loadadd31( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 @@ -808,10 +780,10 @@ ; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12 ; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4 ; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <16 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 ; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 ; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 19 @@ -820,35 +792,35 @@ ; CHECK-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 22 ; CHECK-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 23 ; CHECK-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 24 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <8 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x float>, 
<8 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25 ; CHECK-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 26 ; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 27 ; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_24]] to <4 x float>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_28]], align 4 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX_29]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) ; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] -; CHECK-NEXT: ret float [[TMP12]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX1]], [[OP_RDX2]] +; CHECK-NEXT: ret float [[OP_RDX3]] ; ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 -; THRESHOLD-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 ; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 ; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 ; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 -; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>* -; THRESHOLD-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 @@ -857,10 +829,10 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12 ; THRESHOLD-NEXT: [[ARRAYIDX_12:%.*]] 
= getelementptr inbounds float, float* [[X]], i64 13 ; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 -; THRESHOLD-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* -; THRESHOLD-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 +; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <16 x float>* +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 ; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 ; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 19 @@ -869,22 +841,31 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 22 ; THRESHOLD-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 23 ; THRESHOLD-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 24 +; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <8 x float>* +; THRESHOLD-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* [[TMP2]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25 ; THRESHOLD-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 26 ; THRESHOLD-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 27 ; THRESHOLD-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 +; THRESHOLD-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_24]] to <4 x float>* +; THRESHOLD-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_28]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 -; THRESHOLD-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* -; THRESHOLD-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 -; THRESHOLD-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP7]]) -; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP5]]) +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX_29]], align 4 +; THRESHOLD-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] -; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] -; THRESHOLD-NEXT: ret float [[TMP12]] +; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) +; THRESHOLD-NEXT: [[TMP11:%.*]] = insertelement <2 x 
float> poison, float [[OP_RDX]], i32 0 +; THRESHOLD-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP6]], i32 1 +; THRESHOLD-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0 +; THRESHOLD-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP7]], i32 1 +; THRESHOLD-NEXT: [[TMP15:%.*]] = fadd fast <2 x float> [[TMP12]], [[TMP14]] +; THRESHOLD-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; THRESHOLD-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP16]], [[TMP17]] +; THRESHOLD-NEXT: ret float [[OP_RDX3]] ; entry: %arrayidx = getelementptr inbounds float, float* %x, i64 1 @@ -984,7 +965,6 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 @@ -995,15 +975,15 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] -; CHECK-NEXT: ret float [[OP_EXTRA1]] +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00 +; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP2]], [[OP_RDX1]] +; CHECK-NEXT: ret float [[OP_RDX2]] ; ; THRESHOLD-LABEL: @extra_args( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00 ; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 @@ -1014,9 +994,10 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] -; THRESHOLD-NEXT: ret float [[OP_EXTRA1]] +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00 +; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP2]], [[OP_RDX1]] +; THRESHOLD-NEXT: ret float [[OP_RDX2]] ; entry: %mul = mul nsw i32 %b, %a @@ -1054,7 +1035,6 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, 
float* [[X:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 @@ -1065,17 +1045,16 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 -; CHECK-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00 -; CHECK-NEXT: [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]] -; CHECK-NEXT: ret float [[OP_EXTRA3]] +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float 5.000000e+00, [[CONV]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float 8.000000e+00, [[OP_RDX]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]] +; CHECK-NEXT: ret float [[OP_RDX3]] ; ; THRESHOLD-LABEL: @extra_args_same_several_times( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00 ; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 @@ -1086,11 +1065,11 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 -; THRESHOLD-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00 -; THRESHOLD-NEXT: [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]] -; THRESHOLD-NEXT: ret float [[OP_EXTRA3]] +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float 5.000000e+00, [[CONV]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float 8.000000e+00, [[OP_RDX]] +; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]] +; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]] +; THRESHOLD-NEXT: ret float [[OP_RDX3]] ; entry: %mul = mul nsw i32 %b, %a @@ -1131,8 +1110,6 @@ ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; CHECK-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float -; CHECK-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]] ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 @@ -1143,17 +1120,17 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float 
@llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] -; CHECK-NEXT: ret float [[OP_EXTRA1]] +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONVC]], [[CONV]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[CONV]], 3.000000e+00 +; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], [[OP_RDX1]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]] +; CHECK-NEXT: ret float [[OP_RDX3]] ; ; THRESHOLD-LABEL: @extra_args_no_replace( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; THRESHOLD-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float -; THRESHOLD-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00 -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]] ; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 @@ -1164,9 +1141,15 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] -; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] -; THRESHOLD-NEXT: ret float [[OP_EXTRA1]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> , float [[CONVC]], i32 0 +; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0 +; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[CONV]], i32 1 +; THRESHOLD-NEXT: [[TMP6:%.*]] = fadd fast <2 x float> [[TMP3]], [[TMP5]] +; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 +; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 +; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP7]], [[TMP8]] +; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]] +; THRESHOLD-NEXT: ret float [[OP_RDX3]] ; entry: %mul = mul nsw i32 %b, %a @@ -1265,9 +1248,9 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP6]], [[ARG]] -; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP3]] -; CHECK-NEXT: ret i32 [[OP_EXTRA2]] +; CHECK-NEXT: [[OP_RDX:%.*]] = add nuw i32 [[TMP3]], [[ARG]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = add nsw i32 [[TMP6]], [[OP_RDX]] +; CHECK-NEXT: ret i32 [[OP_RDX2]] ; ; THRESHOLD-LABEL: @wobble( ; THRESHOLD-NEXT: bb: @@ -1280,9 +1263,9 @@ ; THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer ; THRESHOLD-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32> ; THRESHOLD-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) -; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP6]], [[ARG]] -; THRESHOLD-NEXT: [[OP_EXTRA2:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP3]] -; THRESHOLD-NEXT: ret i32 [[OP_EXTRA2]] +; 
THRESHOLD-NEXT: [[OP_RDX:%.*]] = add nuw i32 [[TMP3]], [[ARG]] +; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = add nsw i32 [[TMP6]], [[OP_RDX]] +; THRESHOLD-NEXT: ret i32 [[OP_RDX2]] ; bb: %x1 = xor i32 %arg, %bar diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -873,15 +873,15 @@ ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 ; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) -; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]] -; AVX-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]] -; AVX-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP4]], i32 3, i32 4 -; AVX-NEXT: store i32 [[TMP14]], i32* @var, align 8 -; AVX-NEXT: ret i32 [[OP_EXTRA1]] +; AVX-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]] +; AVX-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]] +; AVX-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]] +; AVX-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]] +; AVX-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]] +; AVX-NEXT: [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4 +; AVX-NEXT: store i32 [[TMP10]], i32* @var, align 8 +; AVX-NEXT: ret i32 [[OP_RDX5]] ; ; AVX2-LABEL: @maxi8_mutiple_uses( ; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 @@ -892,15 +892,15 @@ ; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 ; AVX2-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) -; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]] -; AVX2-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]] -; AVX2-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP4]], i32 3, i32 4 -; AVX2-NEXT: store i32 [[TMP14]], i32* @var, align 8 -; AVX2-NEXT: ret i32 [[OP_EXTRA1]] +; AVX2-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +; AVX2-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]] +; AVX2-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]] +; AVX2-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]] +; AVX2-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]] +; AVX2-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]] +; AVX2-NEXT: 
[[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4 +; AVX2-NEXT: store i32 [[TMP10]], i32* @var, align 8 +; AVX2-NEXT: ret i32 [[OP_RDX5]] ; ; THRESH-LABEL: @maxi8_mutiple_uses( ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 @@ -909,23 +909,23 @@ ; THRESH-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; THRESH-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) -; THRESH-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP6]] -; THRESH-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP6]] -; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 -; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP3]], i32 1 -; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 -; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP4]], i32 1 -; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]] -; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]] -; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 -; THRESH-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; THRESH-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP17]], i32 [[TMP18]] -; THRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 -; THRESH-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 3, i32 4 -; THRESH-NEXT: store i32 [[TMP20]], i32* @var, align 8 -; THRESH-NEXT: ret i32 [[OP_EXTRA1]] +; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 +; THRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP3]], i32 1 +; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP4]], i32 1 +; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP9]], [[TMP11]] +; THRESH-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP9]], <2 x i32> [[TMP11]] +; THRESH-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; THRESH-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; THRESH-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; THRESH-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] +; THRESH-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP15]], i32 [[TMP16]] +; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP14]], [[OP_RDX3]] +; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP14]], i32 [[OP_RDX3]] +; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; THRESH-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 3, i32 4 +; THRESH-NEXT: store i32 [[TMP18]], i32* @var, align 8 +; THRESH-NEXT: ret i32 [[OP_RDX5]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -1058,13 +1058,13 @@ ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* 
getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 ; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) -; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]] -; AVX-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]] -; AVX-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]] -; AVX-NEXT: ret i32 [[OP_EXTRA1]] +; AVX-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]] +; AVX-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]] +; AVX-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]] +; AVX-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]] +; AVX-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]] +; AVX-NEXT: ret i32 [[OP_RDX5]] ; ; AVX2-LABEL: @maxi8_wrong_parent( ; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 @@ -1077,13 +1077,13 @@ ; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 ; AVX2-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) -; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]] -; AVX2-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]] -; AVX2-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]] -; AVX2-NEXT: ret i32 [[OP_EXTRA1]] +; AVX2-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +; AVX2-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]] +; AVX2-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]] +; AVX2-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]] +; AVX2-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]] +; AVX2-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]] +; AVX2-NEXT: ret i32 [[OP_RDX5]] ; ; THRESH-LABEL: @maxi8_wrong_parent( ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 @@ -1093,24 +1093,25 @@ ; THRESH-NEXT: br label [[PP:%.*]] ; THRESH: pp: ; THRESH-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; THRESH-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) -; THRESH-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; THRESH-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; 
THRESH-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i1> poison, i1 [[TMP12]], i32 0 -; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i1> [[TMP13]], i1 [[TMP5]], i32 1 -; THRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 -; THRESH-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP3]], i32 1 -; THRESH-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 -; THRESH-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP4]], i32 1 -; THRESH-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP16]], <2 x i32> [[TMP18]] -; THRESH-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 -; THRESH-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 -; THRESH-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] -; THRESH-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP20]], i32 [[TMP21]] -; THRESH-NEXT: ret i32 [[OP_EXTRA1]] +; THRESH-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8 +; THRESH-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0 +; THRESH-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1 +; THRESH-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i1> poison, i1 [[OP_RDX]], i32 0 +; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i1> [[TMP10]], i1 [[TMP5]], i32 1 +; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 +; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP3]], i32 1 +; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP9]], i32 0 +; THRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP4]], i32 1 +; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP11]], <2 x i32> [[TMP13]], <2 x i32> [[TMP15]] +; THRESH-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) +; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 +; THRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 +; THRESH-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]] +; THRESH-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP18]], i32 [[TMP19]] +; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP17]], [[OP_RDX3]] +; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP17]], i32 [[OP_RDX3]] +; THRESH-NEXT: ret i32 [[OP_RDX5]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -1480,8 +1481,8 @@ ; AVX-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]] ; AVX-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef ; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]]) -; AVX-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef) +; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef) +; AVX-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]]) ; AVX-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) ; AVX-NEXT: ret void ; @@ -1490,8 +1491,8 @@ ; AVX2-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]] ; AVX2-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef ; 
AVX2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]]) -; AVX2-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef) +; AVX2-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef) +; AVX2-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]]) ; AVX2-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) ; AVX2-NEXT: ret void ; @@ -1500,8 +1501,8 @@ ; THRESH-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]] ; THRESH-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef ; THRESH-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) -; THRESH-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]]) -; THRESH-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef) +; THRESH-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef) +; THRESH-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]]) ; THRESH-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) ; THRESH-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -10,23 +10,23 @@ ; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef ; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_3]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[SUB102_1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD94_1]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[ADD78_1]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[SUB86_1]], i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[ADD78_2]], i32 5 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[SUB102_3]], i32 5 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB86_1]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD78_1]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[ADD94_1]], i32 2 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_1]], i32 3 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[ADD78_2]], i32 4 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SUB102_3]], i32 5 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[SHUFFLE1:%.*]] = 
shufflevector <16 x i32> [[TMP11]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = lshr <16 x i32> [[TMP14]], ; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i32> [[TMP15]], ; CHECK-NEXT: [[TMP17:%.*]] = mul nuw <16 x i32> [[TMP16]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -S | FileCheck %s -; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -mattr=avx512vl -S | FileCheck %s +; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -mattr=avx512vl -S | FileCheck %s --check-prefixes=CHECK,AVX512VL declare void @use1(i1) @@ -200,22 +200,14 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[X]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[X]], -; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[TMP4]], 17 -; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[TMP3]], 17 -; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[TMP2]], 17 -; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[TMP1]], 17 -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[S4:%.*]] = select i1 [[TMP7]], i1 [[D0]], i1 false -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP4]], i1 [[TMP6]], i1 false +; CHECK-NEXT: ret i1 [[OP_RDX]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -241,27 +233,16 @@ define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 -; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 -; CHECK-NEXT: 
[[C2:%.*]] = icmp slt i32 [[X2]], 42 -; CHECK-NEXT: call void @use1(i1 [[C2]]) -; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 -; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 -; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false -; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false -; CHECK-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: call void @use1(i1 [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 [[TMP7]], i1 false +; CHECK-NEXT: ret i1 [[OP_RDX]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -288,27 +269,23 @@ define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_select( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 -; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 -; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 -; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 -; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 -; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[X]], i32 0 +; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[TMP4]], 42 +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[TMP3]], 42 +; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP2]], 42 +; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[TMP1]], 42 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[X]], ; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false ; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false ; CHECK-NEXT: call void @use1(i1 [[S2]]) -; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false -; CHECK-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 
@llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[C3]], i1 [[S2]], i1 false +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP7]], i1 [[OP_RDX]], i1 false +; CHECK-NEXT: ret i1 [[OP_RDX1]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -334,31 +311,65 @@ } define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) { -; CHECK-LABEL: @logical_and_icmp_clamp_v8i32( -; CHECK-NEXT: [[X0:%.*]] = extractelement <8 x i32> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <8 x i32> [[X]], i32 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <8 x i32> [[X]], i32 2 -; CHECK-NEXT: [[X3:%.*]] = extractelement <8 x i32> [[X]], i32 3 -; CHECK-NEXT: [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1 -; CHECK-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2 -; CHECK-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X3]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], -; CHECK-NEXT: [[D0:%.*]] = icmp slt i32 [[X0]], [[Y0]] -; CHECK-NEXT: [[D1:%.*]] = icmp slt i32 [[X1]], [[Y1]] -; CHECK-NEXT: [[D2:%.*]] = icmp slt i32 [[X2]], [[Y2]] -; CHECK-NEXT: [[D3:%.*]] = icmp slt i32 [[X3]], [[Y3]] -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[S4:%.*]] = select i1 [[TMP7]], i1 [[D0]], i1 false -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; SSE-LABEL: @logical_and_icmp_clamp_v8i32( +; SSE-NEXT: [[X0:%.*]] = extractelement <8 x i32> [[X:%.*]], i32 0 +; SSE-NEXT: [[X1:%.*]] = extractelement <8 x i32> [[X]], i32 1 +; SSE-NEXT: [[X2:%.*]] = extractelement <8 x i32> [[X]], i32 2 +; SSE-NEXT: [[X3:%.*]] = extractelement <8 x i32> [[X]], i32 3 +; SSE-NEXT: [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0 +; SSE-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1 +; SSE-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2 +; SSE-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X0]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X3]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[X1]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[X2]], i32 2 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[X3]], i32 3 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[Y0]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[Y1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[Y2]], i32 2 +; SSE-NEXT: 
[[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[Y3]], i32 3 +; SSE-NEXT: [[TMP14:%.*]] = icmp slt <4 x i32> [[TMP9]], [[TMP13]] +; SSE-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP5]] +; SSE-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP15]]) +; SSE-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP14]] +; SSE-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP17]]) +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP16]], i1 [[TMP18]], i1 false +; SSE-NEXT: ret i1 [[OP_RDX]] +; +; AVX512VL-LABEL: @logical_and_icmp_clamp_v8i32( +; AVX512VL-NEXT: [[X0:%.*]] = extractelement <8 x i32> [[X:%.*]], i32 0 +; AVX512VL-NEXT: [[X1:%.*]] = extractelement <8 x i32> [[X]], i32 1 +; AVX512VL-NEXT: [[X2:%.*]] = extractelement <8 x i32> [[X]], i32 2 +; AVX512VL-NEXT: [[X3:%.*]] = extractelement <8 x i32> [[X]], i32 3 +; AVX512VL-NEXT: [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0 +; AVX512VL-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1 +; AVX512VL-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2 +; AVX512VL-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3 +; AVX512VL-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX512VL-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX512VL-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX512VL-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 +; AVX512VL-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1 +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2 +; AVX512VL-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X3]], i32 3 +; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[Y0]], i32 0 +; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[Y1]], i32 1 +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[Y2]], i32 2 +; AVX512VL-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[Y3]], i32 3 +; AVX512VL-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[TMP4]], [[TMP8]] +; AVX512VL-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP9]] +; AVX512VL-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP10]]) +; AVX512VL-NEXT: [[OP_RDX:%.*]] = select i1 [[C1]], i1 [[C0]], i1 false +; AVX512VL-NEXT: [[OP_RDX1:%.*]] = select i1 [[C2]], i1 [[C3]], i1 false +; AVX512VL-NEXT: [[OP_RDX2:%.*]] = select i1 [[OP_RDX]], i1 [[OP_RDX1]], i1 false +; AVX512VL-NEXT: [[OP_RDX3:%.*]] = select i1 [[TMP11]], i1 [[OP_RDX2]], i1 false +; AVX512VL-NEXT: ret i1 [[OP_RDX3]] ; %x0 = extractelement <8 x i32> %x, i32 0 %x1 = extractelement <8 x i32> %x, i32 1 @@ -388,24 +399,19 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_partial( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 -; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 -; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 -; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 -; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false -; CHECK-NEXT: [[S2:%.*]] = select i1 
[[S1]], i1 [[C2]], i1 false -; CHECK-NEXT: [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0 +; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[TMP3]], 42 +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[TMP2]], 42 +; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42 +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], +; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[C1]], i1 [[C0]], i1 false +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false +; CHECK-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP6]], i1 [[OP_RDX1]], i1 false +; CHECK-NEXT: ret i1 [[OP_RDX2]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -431,26 +437,22 @@ define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 -; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 -; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 -; CHECK-NEXT: [[C3:%.*]] = icmp ult i32 [[X3]], 42 -; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 -; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false -; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false -; CHECK-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[X]], i32 0 +; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[TMP4]], 42 +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[TMP3]], 42 +; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP2]], 42 +; CHECK-NEXT: [[C3:%.*]] = icmp ult i32 [[TMP1]], 42 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[X]], +; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[C1]], i1 [[C0]], i1 false +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[C2]], i1 [[C3]], i1 false +; CHECK-NEXT: [[OP_RDX2:%.*]] = select i1 [[OP_RDX]], i1 [[OP_RDX1]], i1 false +; CHECK-NEXT: [[OP_RDX3:%.*]] = select i1 [[TMP7]], i1 [[OP_RDX2]], i1 false +; CHECK-NEXT: ret i1 
[[OP_RDX3]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -480,8 +482,8 @@ ; CHECK-NEXT: [[S3:%.*]] = select i1 [[C:%.*]], i1 [[C]], i1 false ; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP3]], i1 [[S3]], i1 false -; CHECK-NEXT: ret i1 [[OP_EXTRA]] +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 [[S3]], i1 false +; CHECK-NEXT: ret i1 [[OP_RDX]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll @@ -31,15 +31,15 @@ ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) -; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[SUM]] +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 [[OP_EXTRA]] +; CHECK-NEXT: ret i32 [[OP_RDX]] ; entry: %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1 @@ -118,17 +118,17 @@ ; CHECK-NEXT: [[ARRAYIDX_Q_7:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 7 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[Q]] to <8 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) -; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]] +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP5]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 [[OP_EXTRA]] +; CHECK-NEXT: ret i32 [[OP_RDX]] ; entry: %arrayidx.p.1 = getelementptr inbounds i32, i32* %p, i64 1 @@ -223,18 +223,18 @@ ; CHECK-NEXT: [[ARRAYIDX_Q_7:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 7 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = 
bitcast i32* [[Q]] to <8 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[SHUFFLE]], [[TMP3]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[SHUFFLE]] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) -; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]] +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP5]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 [[OP_EXTRA]] +; CHECK-NEXT: ret i32 [[OP_RDX]] ; entry: %arrayidx.p.1 = getelementptr inbounds i32, i32* %p, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -11,9 +11,9 @@ ; } ; Vector cost is 5, Scalar cost is 7 -; AVX: Adding cost -2 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction) +; AVX: Adding cost -2 for reduction that starts with %0 = load i32, i32* %p, align 4 (It is a splitting reduction) ; Vector cost is 6, Scalar cost is 7 -; SSE: Adding cost -1 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction) +; SSE: Adding cost -1 for reduction that starts with %0 = load i32, i32* %p, align 4 (It is a splitting reduction) define i32 @test_add(i32* nocapture readonly %p) { ; CHECK-LABEL: @test_add( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -15,8 +15,8 @@ ; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> , [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], undef -; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE10]], +; CHECK-NEXT: [[SHUFFLE4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE4]], ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef ; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63 @@ -24,18 +24,13 @@ ; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP6]], undef ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP9]], undef -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP9]], i32 undef -; CHECK-NEXT: [[OP_EXTRA2:%.*]] = icmp slt i32 [[OP_EXTRA1]], undef -; CHECK-NEXT: [[OP_EXTRA3:%.*]] = select i1 [[OP_EXTRA2]], i32 [[OP_EXTRA1]], i32 undef -; CHECK-NEXT: [[OP_EXTRA4:%.*]] = icmp slt i32 [[OP_EXTRA3]], undef -; CHECK-NEXT: [[OP_EXTRA5:%.*]] = select i1 [[OP_EXTRA4]], i32 [[OP_EXTRA3]], i32 undef -; CHECK-NEXT: [[OP_EXTRA6:%.*]] = icmp slt i32 [[OP_EXTRA5]], 
undef -; CHECK-NEXT: [[OP_EXTRA7:%.*]] = select i1 [[OP_EXTRA6]], i32 [[OP_EXTRA5]], i32 undef -; CHECK-NEXT: [[OP_EXTRA8:%.*]] = icmp slt i32 [[OP_EXTRA7]], undef -; CHECK-NEXT: [[OP_EXTRA9:%.*]] = select i1 [[OP_EXTRA8]], i32 [[OP_EXTRA7]], i32 undef -; CHECK-NEXT: [[T45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA9]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP10]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = icmp slt i32 [[OP_RDX1]], undef +; CHECK-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 undef +; CHECK-NEXT: [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX3]] ; CHECK-NEXT: unreachable ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll @@ -24,26 +24,21 @@ ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* undef, i64 0, i64 6 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[I]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 8 -; CHECK-NEXT: [[I5:%.*]] = add i32 undef, undef ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) -; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add i32 [[TMP2]], [[I5]] -; CHECK-NEXT: [[I10:%.*]] = add i32 [[OP_EXTRA2]], undef -; CHECK-NEXT: [[I11:%.*]] = add i32 [[OP_EXTRA2]], [[I10]] +; CHECK-NEXT: [[OP_RDX6:%.*]] = add i32 [[TMP2]], undef ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[I1]] to <4 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[I12:%.*]] = add i32 undef, undef ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP5]], [[I12]] -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], undef -; CHECK-NEXT: [[I18:%.*]] = add i32 [[OP_EXTRA1]], [[I11]] -; CHECK-NEXT: [[I19:%.*]] = add i32 [[OP_EXTRA1]], [[I18]] -; CHECK-NEXT: [[I20:%.*]] = add i32 undef, [[I19]] -; CHECK-NEXT: [[I21:%.*]] = add i32 undef, [[I20]] -; CHECK-NEXT: [[I22:%.*]] = add i32 undef, [[I21]] -; CHECK-NEXT: [[I23:%.*]] = add i32 undef, [[I22]] +; CHECK-NEXT: [[OP_RDX5:%.*]] = add i32 [[TMP5]], undef +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[OP_RDX6]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX6]], [[OP_RDX5]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[OP_RDX5]] +; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[TMP6]], [[OP_RDX3]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I23]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[OP_RDX4]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll @@ -17,11 +17,9 @@ ; CHECK-NEXT: [[TMP0:%.*]] = 
bitcast i32* [[DOTSROA_CAST_4]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], undef -; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 undef -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[OP_EXTRA]], undef -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[TMP4]], i32 [[OP_EXTRA]], i32 undef -; CHECK-NEXT: [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_EXTRA1]] +; CHECK-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP2]], undef +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP2]], i32 undef +; CHECK-NEXT: [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_RDX1]] ; CHECK-NEXT: [[CMP_I1_10:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_9]], undef ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll @@ -50,14 +50,14 @@ ; CHECK-NEXT: [[TMP34:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP32]] ; CHECK-NEXT: [[TMP35:%.*]] = select <4 x i1> [[TMP33]], <4 x i32> [[TMP34]], <4 x i32> [[TMP32]] ; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP35]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP36]], [[B_0]] -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP36]], i32 [[B_0]] +; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 [[TMP36]], [[B_0]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP36]], i32 [[B_0]] ; CHECK-NEXT: [[SUB_116:%.*]] = sub i32 [[TMP30]], [[TMP1]] ; CHECK-NEXT: [[TMP37:%.*]] = icmp slt i32 [[SUB_116]], 0 ; CHECK-NEXT: [[NEG_117:%.*]] = sub nsw i32 0, [[SUB_116]] ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[NEG_117]], i32 [[SUB_116]] -; CHECK-NEXT: [[CMP12_118:%.*]] = icmp slt i32 [[TMP38]], [[OP_EXTRA1]] -; CHECK-NEXT: [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP38]], i32 [[OP_EXTRA1]] +; CHECK-NEXT: [[CMP12_118:%.*]] = icmp slt i32 [[TMP38]], [[OP_RDX1]] +; CHECK-NEXT: [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP38]], i32 [[OP_RDX1]] ; CHECK-NEXT: [[SUB_1_1:%.*]] = sub i32 [[TMP30]], [[TMP2]] ; CHECK-NEXT: [[TMP39:%.*]] = icmp slt i32 [[SUB_1_1]], 0 ; CHECK-NEXT: [[NEG_1_1:%.*]] = sub nsw i32 0, [[SUB_1_1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -8,8 +8,8 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x 
i32> [[TMP4]], i32 [[A4:%.*]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 @@ -57,9 +57,9 @@ ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 @@ -111,9 +111,9 @@ ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 diff --git a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll --- a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll @@ -43,7 +43,7 @@ define void @test2() { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> poison, <4 x i32> ) +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> poison, <4 x i32> ) ; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <4 x i32> poison, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77)