diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5971,27 +5971,48 @@ TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef VL, unsigned OpIdx) { assert(!VL.empty()); - const auto *Op0 = cast(VL.front())->getOperand(OpIdx); + const auto *I0 = cast(*find_if(VL, Instruction::classof)); + const auto *Op0 = I0->getOperand(OpIdx); const bool IsConstant = all_of(VL, [&](Value *V) { // TODO: We should allow undef elements here - auto *Op = cast(V)->getOperand(OpIdx); + const auto *I = dyn_cast(V); + if (!I) + return true; + auto *Op = I->getOperand(OpIdx); return isConstant(Op) && !isa(Op); }); const bool IsUniform = all_of(VL, [&](Value *V) { // TODO: We should allow undef elements here - return cast(V)->getOperand(OpIdx) == Op0; + const auto *I = dyn_cast(V); + if (!I) + return false; + return I->getOperand(OpIdx) == Op0; }); const bool IsPowerOfTwo = all_of(VL, [&](Value *V) { // TODO: We should allow undef elements here - auto *Op = cast(V)->getOperand(OpIdx); + const auto *I = dyn_cast(V); + if (!I) { + assert((isa(V) || + I0->getOpcode() == Instruction::GetElementPtr) && + "Expected undef or GEP."); + return true; + } + auto *Op = I->getOperand(OpIdx); if (auto *CI = dyn_cast(Op)) return CI->getValue().isPowerOf2(); return false; }); const bool IsNegatedPowerOfTwo = all_of(VL, [&](Value *V) { // TODO: We should allow undef elements here - auto *Op = cast(V)->getOperand(OpIdx); + const auto *I = dyn_cast(V); + if (!I) { + assert((isa(V) || + I0->getOpcode() == Instruction::GetElementPtr) && + "Expected undef or GEP."); + return true; + } + const auto *Op = I->getOperand(OpIdx); if (auto *CI = dyn_cast(Op)) return CI->getValue().isNegatedPowerOf2(); return false; @@ -6306,243 +6327,267 @@ Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); + const unsigned Sz = VL.size(); + auto &&GetCostDiff = + [this, E, Sz, CommonCost, + VL0](function_ref ScalarEltCost, + function_ref VectorCost) { + // Calculate the cost of this instruction. + InstructionCost ScalarCost = 0; + if (isa(VL0)) { + // For some of the instructions no need to calculate cost for each + // particular instruction, we can use the cost of the single + // instruction x total number of scalar instructions. + ScalarCost = Sz * ScalarEltCost(0); + } else { + for (unsigned I = 0; I < Sz; ++I) + ScalarCost += ScalarEltCost(I); + } + + InstructionCost VecCost = VectorCost(CommonCost); + LLVM_DEBUG( + dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost)); + // Disable warnings for `this` and `E` are unused. Required for + // `dumpTreeCosts`. + (void)this; + (void)E; + return VecCost - ScalarCost; + }; switch (ShuffleOrOp) { - case Instruction::PHI: - return 0; + case Instruction::PHI: { + // Count reused scalars. + InstructionCost ScalarCost = 0; + SmallPtrSet CountedOps; + for (Value *V : VL) { + auto *PHI = dyn_cast(V); + if (!PHI) + continue; - case Instruction::ExtractValue: - case Instruction::ExtractElement: { - // The common cost of removal ExtractElement/ExtractValue instructions + - // the cost of shuffles, if required to resuffle the original vector. - if (NeedToShuffleReuses) { - unsigned Idx = 0; - for (unsigned I : E->ReuseShuffleIndices) { - if (ShuffleOrOp == Instruction::ExtractElement) { - auto *EE = cast(VL[I]); - CommonCost -= TTI->getVectorInstrCost( - *EE, EE->getVectorOperandType(), *getExtractIndex(EE)); - } else { - CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); - ++Idx; - } - } - Idx = EntryVF; - for (Value *V : VL) { - if (ShuffleOrOp == Instruction::ExtractElement) { - auto *EE = cast(V); - CommonCost += TTI->getVectorInstrCost( - *EE, EE->getVectorOperandType(), *getExtractIndex(EE)); - } else { - --Idx; - CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); - } - } - } - if (ShuffleOrOp == Instruction::ExtractValue) { - for (unsigned I = 0, E = VL.size(); I < E; ++I) { - auto *EI = cast(VL[I]); - // Take credit for instruction that will become dead. - if (EI->hasOneUse()) { - Instruction *Ext = EI->user_back(); - if (isa(Ext) && - all_of(Ext->users(), - [](User *U) { return isa(U); })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - CommonCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, I); - // Add back the cost of s|zext which is subtracted separately. - CommonCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EI->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); - continue; - } - } - CommonCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); - } + ValueList Operands(PHI->getNumIncomingValues(), nullptr); + for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) { + Value *Op = PHI->getIncomingValue(I); + Operands[I] = Op; + } + if (const TreeEntry *OpTE = getTreeEntry(Operands.front())) + if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second) + if (!OpTE->ReuseShuffleIndices.empty()) + ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() - + OpTE->Scalars.size()); + } + + return CommonCost - ScalarCost; + } + case Instruction::ExtractValue: + case Instruction::ExtractElement: { + auto &&GetScalarCost = [this, ShuffleOrOp, VL, ScalarTy, + CostKind](unsigned Idx) { + auto *I = cast(VL[Idx]); + VectorType *SrcVecTy; + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *EE = cast(I); + SrcVecTy = EE->getVectorOperandType(); } else { - AdjustExtractsCost(CommonCost); + auto *EV = cast(I); + Type *AggregateTy = EV->getAggregateOperand()->getType(); + unsigned NumElts; + if (auto *ATy = dyn_cast(AggregateTy)) + NumElts = ATy->getNumElements(); + else + NumElts = AggregateTy->getStructNumElements(); + SrcVecTy = FixedVectorType::get(ScalarTy, NumElts); + } + if (I->hasOneUse()) { + Instruction *Ext = I->user_back(); + if ((isa(Ext) || isa(Ext)) && + all_of(Ext->users(), + [](User *U) { return isa(U); })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + InstructionCost Cost = TTI->getExtractWithExtendCost( + Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I)); + // Subtract the cost of s|zext which is subtracted separately. + Cost -= TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), I->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + return Cost; + } } + return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy, + *getExtractIndex(I)); + }; + auto &&GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; - } - case Instruction::InsertElement: { - assert(E->ReuseShuffleIndices.empty() && - "Unique insertelements only are expected."); - auto *SrcVecTy = cast(VL0->getType()); - unsigned const NumElts = SrcVecTy->getNumElements(); - unsigned const NumScalars = VL.size(); - - unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); - - SmallVector InsertMask(NumElts, UndefMaskElem); - unsigned OffsetBeg = *getInsertIndex(VL.front()); - unsigned OffsetEnd = OffsetBeg; - InsertMask[OffsetBeg] = 0; - for (auto [I, V] : enumerate(VL.drop_front())) { - unsigned Idx = *getInsertIndex(V); - if (OffsetBeg > Idx) - OffsetBeg = Idx; - else if (OffsetEnd < Idx) - OffsetEnd = Idx; - InsertMask[Idx] = I + 1; - } - unsigned VecScalarsSz = PowerOf2Ceil(NumElts); - if (NumOfParts > 0) - VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts); - unsigned VecSz = - (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * - VecScalarsSz; - unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); - unsigned InsertVecSz = std::min( - PowerOf2Ceil(OffsetEnd - OffsetBeg + 1), - ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * - VecScalarsSz); - bool IsWholeSubvector = - OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); - // Check if we can safely insert a subvector. If it is not possible, just - // generate a whole-sized vector and shuffle the source vector and the new - // subvector. - if (OffsetBeg + InsertVecSz > VecSz) { - // Align OffsetBeg to generate correct mask. - OffsetBeg = alignDown(OffsetBeg, VecSz, Offset); - InsertVecSz = VecSz; - } - - APInt DemandedElts = APInt::getZero(NumElts); - // TODO: Add support for Instruction::InsertValue. - SmallVector Mask; - if (!E->ReorderIndices.empty()) { - inversePermutation(E->ReorderIndices, Mask); - Mask.append(InsertVecSz - Mask.size(), UndefMaskElem); + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::InsertElement: { + assert(E->ReuseShuffleIndices.empty() && + "Unique insertelements only are expected."); + auto *SrcVecTy = cast(VL0->getType()); + unsigned const NumElts = SrcVecTy->getNumElements(); + unsigned const NumScalars = VL.size(); + + unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); + + SmallVector InsertMask(NumElts, UndefMaskElem); + unsigned OffsetBeg = *getInsertIndex(VL.front()); + unsigned OffsetEnd = OffsetBeg; + InsertMask[OffsetBeg] = 0; + for (auto [I, V] : enumerate(VL.drop_front())) { + unsigned Idx = *getInsertIndex(V); + if (OffsetBeg > Idx) + OffsetBeg = Idx; + else if (OffsetEnd < Idx) + OffsetEnd = Idx; + InsertMask[Idx] = I + 1; + } + unsigned VecScalarsSz = PowerOf2Ceil(NumElts); + if (NumOfParts > 0) + VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts); + unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * + VecScalarsSz; + unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); + unsigned InsertVecSz = std::min( + PowerOf2Ceil(OffsetEnd - OffsetBeg + 1), + ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz); + bool IsWholeSubvector = + OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); + // Check if we can safely insert a subvector. If it is not possible, just + // generate a whole-sized vector and shuffle the source vector and the new + // subvector. + if (OffsetBeg + InsertVecSz > VecSz) { + // Align OffsetBeg to generate correct mask. + OffsetBeg = alignDown(OffsetBeg, VecSz, Offset); + InsertVecSz = VecSz; + } + + APInt DemandedElts = APInt::getZero(NumElts); + // TODO: Add support for Instruction::InsertValue. + SmallVector Mask; + if (!E->ReorderIndices.empty()) { + inversePermutation(E->ReorderIndices, Mask); + Mask.append(InsertVecSz - Mask.size(), UndefMaskElem); + } else { + Mask.assign(VecSz, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); + } + bool IsIdentity = true; + SmallVector PrevMask(InsertVecSz, UndefMaskElem); + Mask.swap(PrevMask); + for (unsigned I = 0; I < NumScalars; ++I) { + unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); + DemandedElts.setBit(InsertIdx); + IsIdentity &= InsertIdx - OffsetBeg == I; + Mask[InsertIdx - OffsetBeg] = I; + } + assert(Offset < NumElts && "Failed to find vector index offset"); + + InstructionCost Cost = 0; + Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, + /*Insert*/ true, /*Extract*/ false); + + // First cost - resize to actual vector size if not identity shuffle or + // need to shift the vector. + // Do not calculate the cost if the actual size is the register size and + // we can merge this shuffle with the following SK_Select. + auto *InsertVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); + if (!IsIdentity) + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + InsertVecTy, Mask); + auto *FirstInsert = cast(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, cast(V)->getOperand(0)); + })); + // Second cost - permutation with subvector, if some elements are from the + // initial vector or inserting a subvector. + // TODO: Implement the analysis of the FirstInsert->getOperand(0) + // subvector of ActualVecTy. + SmallBitVector InMask = + isUndefVector(FirstInsert->getOperand(0), InsertMask); + if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { + if (InsertVecSz != VecSz) { + auto *ActualVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), VecSz); + Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, None, + CostKind, OffsetBeg - Offset, InsertVecTy); } else { - Mask.assign(VecSz, UndefMaskElem); - std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); - } - bool IsIdentity = true; - SmallVector PrevMask(InsertVecSz, UndefMaskElem); - Mask.swap(PrevMask); - for (unsigned I = 0; I < NumScalars; ++I) { - unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); - DemandedElts.setBit(InsertIdx); - IsIdentity &= InsertIdx - OffsetBeg == I; - Mask[InsertIdx - OffsetBeg] = I; - } - assert(Offset < NumElts && "Failed to find vector index offset"); - - InstructionCost Cost = 0; - Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, - /*Insert*/ true, /*Extract*/ false); - - // First cost - resize to actual vector size if not identity shuffle or - // need to shift the vector. - // Do not calculate the cost if the actual size is the register size and - // we can merge this shuffle with the following SK_Select. - auto *InsertVecTy = - FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); - if (!IsIdentity) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - InsertVecTy, Mask); - auto *FirstInsert = cast(*find_if(E->Scalars, [E](Value *V) { - return !is_contained(E->Scalars, cast(V)->getOperand(0)); - })); - // Second cost - permutation with subvector, if some elements are from the - // initial vector or inserting a subvector. - // TODO: Implement the analysis of the FirstInsert->getOperand(0) - // subvector of ActualVecTy. - SmallBitVector InMask = - isUndefVector(FirstInsert->getOperand(0), InsertMask); - if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { - if (InsertVecSz != VecSz) { - auto *ActualVecTy = - FixedVectorType::get(SrcVecTy->getElementType(), VecSz); - Cost += - TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, None, - CostKind, OffsetBeg - Offset, InsertVecTy); - } else { - for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) - Mask[I] = InMask.test(I) ? UndefMaskElem : I; - for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; - I <= End; ++I) - if (Mask[I] != UndefMaskElem) - Mask[I] = I + VecSz; - for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) - Mask[I] = InMask.test(I) ? UndefMaskElem : I; - Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); - } + for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) + Mask[I] = InMask.test(I) ? UndefMaskElem : I; + for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; + I <= End; ++I) + if (Mask[I] != UndefMaskElem) + Mask[I] = I + VecSz; + for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) + Mask[I] = InMask.test(I) ? UndefMaskElem : I; + Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); } - return Cost; } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { + return Cost; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto &&GetScalarCost = [this, E, ScalarTy, VL, CostKind](unsigned Idx) { + auto *VI = cast(VL[Idx]); + return TTI->getCastInstrCost(E->getOpcode(), ScalarTy, + VI->getOperand(0)->getType(), + TTI::getCastContextHint(VI), CostKind, VI); + }; + auto &&GetVectorCost = [this, VL, VL0, VecTy, E, + CostKind](InstructionCost CommonCost) { Type *SrcTy = VL0->getOperand(0)->getType(); - InstructionCost ScalarEltCost = - TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, - TTI::getCastContextHint(VL0), CostKind, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - - // Calculate the cost of this instruction. - InstructionCost ScalarCost = VL.size() * ScalarEltCost; - auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); - InstructionCost VecCost = 0; + InstructionCost VecCost = CommonCost; // Check if the values are candidates to demote. - if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { - VecCost = CommonCost + TTI->getCastInstrCost( - E->getOpcode(), VecTy, SrcVecTy, - TTI::getCastContextHint(VL0), CostKind, VL0); - } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return VecCost - ScalarCost; - } - case Instruction::FCmp: - case Instruction::ICmp: - case Instruction::Select: { - // Calculate the cost of this instruction. - InstructionCost ScalarEltCost = - TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), - CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } + if (!MinBWs.count(VL0) || VecTy != SrcVecTy) + VecCost += + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, + TTI::getCastContextHint(VL0), CostKind, VL0); + return VecCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: { + CmpInst::Predicate VecPred, SwappedVecPred; + auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value()); + if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) || + match(VL0, MatchCmp)) + SwappedVecPred = CmpInst::getSwappedPredicate(VecPred); + else + SwappedVecPred = VecPred = CmpInst::BAD_ICMP_PREDICATE; + auto &&GetScalarCost = [this, E, ScalarTy, VL, CostKind, &VecPred, + &SwappedVecPred](unsigned Idx) { + auto *VI = cast(VL[Idx]); + CmpInst::Predicate CurrentPred; + auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); + if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) && + !match(VI, MatchCmp)) || + (CurrentPred != VecPred && CurrentPred != SwappedVecPred)) + VecPred = SwappedVecPred = CmpInst::BAD_ICMP_PREDICATE; + + return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, + Builder.getInt1Ty(), CurrentPred, CostKind, + VI); + }; + auto &&GetVectorCost = [this, VL, VL0, VecTy, E, CostKind, + &VecPred](InstructionCost CommonCost) { auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - - // Check if all entries in VL are either compares or selects with compares - // as condition that have the same predicates. - CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE; - bool First = true; - for (auto *V : VL) { - CmpInst::Predicate CurrentPred; - auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); - if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) && - !match(V, MatchCmp)) || - (!First && VecPred != CurrentPred)) { - VecPred = CmpInst::BAD_ICMP_PREDICATE; - break; - } - First = false; - VecPred = CurrentPred; - } InstructionCost VecCost = TTI->getCmpSelInstrCost( E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); - // Check if it is possible and profitable to use min/max for selects in - // VL. + // Check if it is possible and profitable to use min/max for selects + // in VL. // auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { @@ -6550,220 +6595,188 @@ {VecTy, VecTy}); InstructionCost IntrinsicCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - // If the selects are the only uses of the compares, they will be dead - // and we can adjust the cost by removing their cost. + // If the selects are the only uses of the compares, they will be + // dead and we can adjust the cost by removing their cost. if (IntrinsicAndUse.second) IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy, VecPred, CostKind); VecCost = std::min(VecCost, IntrinsicCost); } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; - } - case Instruction::FNeg: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - const unsigned OpIdx = isa(VL0) ? 1 : 0; - - InstructionCost ScalarCost = 0; - for (auto *V : VL) { - auto *VI = cast(V); - TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0)); - TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(VI->getOperand(OpIdx)); - SmallVector Operands(VI->operand_values()); - ScalarCost += - TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, - Op1Info, Op2Info, Operands, VI); - } - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarCost/VL.size(); - } + return VecCost + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::FNeg: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::GetElementPtr: { + unsigned Opcode = ShuffleOrOp == Instruction::GetElementPtr + ? Instruction::Add + : ShuffleOrOp; + auto &&GetScalarCost = [this, ScalarTy, VL, CostKind, + Opcode](unsigned Idx) { + auto *VI = dyn_cast(VL[Idx]); + // GEPs may contain just addresses without instructions, consider + // their cost 0. + if (!VI) + return InstructionCost(); + unsigned OpIdx = isa(VI) ? 0 : 1; + TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0)); + TTI::OperandValueInfo Op2Info = + TTI::getOperandInfo(VI->getOperand(OpIdx)); + SmallVector Operands(VI->operand_values()); + return TTI->getArithmeticInstrCost(Opcode, ScalarTy, CostKind, Op1Info, + Op2Info, Operands, VI); + }; + auto &&GetVectorCost = [this, VecTy, CostKind, VL0, VL, + Opcode](InstructionCost CommonCost) { + unsigned OpIdx = isa(VL0) ? 0 : 1; TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0); TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx); - InstructionCost VecCost = - TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, - Op1Info, Op2Info); - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; - } - case Instruction::GetElementPtr: { - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueKind Op2VK = - any_of(VL, - [](Value *V) { - return isa(V) && - !isConstant( - cast(V)->getOperand(1)); - }) - ? TargetTransformInfo::OK_AnyValue - : TargetTransformInfo::OK_UniformConstantValue; - - InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( - Instruction::Add, ScalarTy, CostKind, - {Op1VK, TargetTransformInfo::OP_None}, - {Op2VK, TargetTransformInfo::OP_None}); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecCost = TTI->getArithmeticInstrCost( - Instruction::Add, VecTy, CostKind, - {Op1VK, TargetTransformInfo::OP_None}, - {Op2VK, TargetTransformInfo::OP_None}); - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; - } - case Instruction::Load: { - // Cost of wide load - cost of scalar loads. - Align Alignment = cast(VL0)->getAlign(); - InstructionCost ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, 0, - CostKind, {TTI::OK_AnyValue, TTI::OP_None}, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; + return TTI->getArithmeticInstrCost(Opcode, VecTy, CostKind, Op1Info, + Op2Info) + + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::Load: { + auto &&GetScalarCost = [this, ScalarTy, VL, CostKind, VL0](unsigned Idx) { + auto *VI = cast(VL[Idx]); + InstructionCost GEPCost = 0; + if (VI != VL0) { + auto *Ptr = dyn_cast(VI->getPointerOperand()); + if (Ptr && Ptr->hasOneUse() && !Ptr->hasAllConstantIndices()) + GEPCost = TTI->getArithmeticInstrCost(Instruction::Add, + Ptr->getType(), CostKind); + } + return GEPCost + + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(), + VI->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo(), VI); + }; + auto &&GetVectorCost = [this, VL, VL0, VecTy, E, + CostKind](InstructionCost CommonCost) { + auto *LI0 = cast(VL0); InstructionCost VecLdCost; if (E->State == TreeEntry::Vectorize) { - VecLdCost = - TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0, - CostKind, TTI::OperandValueInfo(), VL0); - for (Value *V : VL) { - auto *VI = cast(V); - // Add the costs of scalar GEP pointers, to be removed from the code. - if (VI == VL0) - continue; - auto *Ptr = dyn_cast(VI->getPointerOperand()); - if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices()) - continue; - ScalarLdCost += TTI->getArithmeticInstrCost(Instruction::Add, - Ptr->getType(), CostKind); - } + VecLdCost = TTI->getMemoryOpCost( + Instruction::Load, VecTy, LI0->getAlign(), + LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); } else { assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); - Align CommonAlignment = Alignment; + Align CommonAlignment = LI0->getAlign(); for (Value *V : VL) CommonAlignment = std::min(CommonAlignment, cast(V)->getAlign()); VecLdCost = TTI->getGatherScatterOpCost( - Instruction::Load, VecTy, cast(VL0)->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind, VL0); + Instruction::Load, VecTy, LI0->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind); } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost)); - return CommonCost + VecLdCost - ScalarLdCost; - } - case Instruction::Store: { - // We know that we can merge the stores. Calculate the cost. - bool IsReorder = !E->ReorderIndices.empty(); - auto *SI = - cast(IsReorder ? VL[E->ReorderIndices.front()] : VL0); - Align Alignment = SI->getAlign(); - InstructionCost ScalarStCost = 0; - for (auto *V : VL) { - auto *VI = cast(V); - TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getOperand(0)); - ScalarStCost += - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, - CostKind, OpInfo, VI); - // Add the costs of scalar GEP pointers, to be removed from the code. - if (VI == SI) - continue; + return VecLdCost + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::Store: { + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = cast(IsReorder ? VL[E->ReorderIndices.front()] : VL0); + auto &&GetScalarCost = [this, ScalarTy, VL, CostKind, SI](unsigned Idx) { + auto *VI = cast(VL[Idx]); + InstructionCost GEPCost = 0; + if (VI != SI) { auto *Ptr = dyn_cast(VI->getPointerOperand()); - if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices()) - continue; - ScalarStCost += TTI->getArithmeticInstrCost(Instruction::Add, - Ptr->getType(), CostKind); - } + if (Ptr && Ptr->hasOneUse() && !Ptr->hasAllConstantIndices()) + GEPCost = TTI->getArithmeticInstrCost(Instruction::Add, + Ptr->getType(), CostKind); + } + TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0); + return GEPCost + TTI->getMemoryOpCost( + Instruction::Store, ScalarTy, VI->getAlign(), + VI->getPointerAddressSpace(), CostKind, OpInfo, VI); + }; + auto &&GetVectorCost = [this, VL, VecTy, CostKind, + SI](InstructionCost CommonCost) { + // We know that we can merge the stores. Calculate the cost. TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0); - InstructionCost VecStCost = - TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment, 0, CostKind, - OpInfo); - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost)); - return CommonCost + VecStCost - ScalarStCost; - } - case Instruction::Call: { - CallInst *CI = cast(VL0); + return TTI->getMemoryOpCost(Instruction::Store, VecTy, SI->getAlign(), + SI->getPointerAddressSpace(), CostKind, + OpInfo) + + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::Call: { + auto &&GetScalarCost = [this, VL, CostKind](unsigned Idx) { + auto *CI = cast(VL[Idx]); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - - // Calculate the cost of the scalar and vector calls. - IntrinsicCostAttributes CostAttrs(ID, *CI, 1); - InstructionCost ScalarEltCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; + if (ID != Intrinsic::not_intrinsic) { + IntrinsicCostAttributes CostAttrs(ID, *CI, 1); + return TTI->getIntrinsicInstrCost(CostAttrs, CostKind); } - InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; - + return TTI->getCallInstrCost(CI->getCalledFunction(), + CI->getFunctionType()->getReturnType(), + CI->getFunctionType()->params(), CostKind); + }; + auto &&GetVectorCost = [this, VL0, VecTy](InstructionCost CommonCost) { + auto *CI = cast(VL0); auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); - InstructionCost VecCallCost = - std::min(VecCallCosts.first, VecCallCosts.second); - - LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost - << " (" << VecCallCost << "-" << ScalarCallCost << ")" - << " for " << *CI << "\n"); - - return CommonCost + VecCallCost - ScalarCallCost; - } - case Instruction::ShuffleVector: { - assert(E->isAltShuffle() && - ((Instruction::isBinaryOp(E->getOpcode()) && - Instruction::isBinaryOp(E->getAltOpcode())) || - (Instruction::isCast(E->getOpcode()) && - Instruction::isCast(E->getAltOpcode())) || - (isa(VL0) && isa(E->getAltOp()))) && - "Invalid Shuffle Vector Operand"); - InstructionCost ScalarCost = 0; - if (NeedToShuffleReuses) { - for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast(VL[Idx]); - CommonCost -= TTI->getInstructionCost(I, CostKind); - } - for (Value *V : VL) { - Instruction *I = cast(V); - CommonCost += TTI->getInstructionCost(I, CostKind); - } - } - for (Value *V : VL) { - Instruction *I = cast(V); - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - ScalarCost += TTI->getInstructionCost(I, CostKind); + return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::ShuffleVector: { + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode())) || + (isa(VL0) && isa(E->getAltOp()))) && + "Invalid Shuffle Vector Operand"); + // Try to find the previous shuffle node with the same operands and same + // main/alternate ops. + auto &&TryFindNodeWithEqualOperands = [this, E]() { + for (const std::unique_ptr &TE : VectorizableTree) { + if (TE.get() == E) + break; + if (TE->isAltShuffle() && + ((TE->getOpcode() == E->getOpcode() && + TE->getAltOpcode() == E->getAltOpcode()) || + (TE->getOpcode() == E->getAltOpcode() && + TE->getAltOpcode() == E->getOpcode())) && + TE->hasEqualOperands(*E)) + return true; } + return false; + }; + auto &&GetScalarCost = [this, VL, CostKind, E](unsigned Idx) { + auto *VI = cast(VL[Idx]); + assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode"); + (void)E; + return TTI->getInstructionCost(VI, CostKind); + }; + // Need to clear CommonCost since the final shuffle cost is included into + // vector cost. + auto &&GetVectorCost = [this, VecTy, E, CostKind, VL, FinalVecTy, + &TryFindNodeWithEqualOperands, VL0, + ScalarTy](InstructionCost) { // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. InstructionCost VecCost = 0; - // Try to find the previous shuffle node with the same operands and same - // main/alternate ops. - auto &&TryFindNodeWithEqualOperands = [this, E]() { - for (const std::unique_ptr &TE : VectorizableTree) { - if (TE.get() == E) - break; - if (TE->isAltShuffle() && - ((TE->getOpcode() == E->getOpcode() && - TE->getAltOpcode() == E->getAltOpcode()) || - (TE->getOpcode() == E->getAltOpcode() && - TE->getAltOpcode() == E->getOpcode())) && - TE->hasEqualOperands(*E)) - return true; - } - return false; - }; if (TryFindNodeWithEqualOperands()) { LLVM_DEBUG({ dbgs() << "SLP: diamond match for alternate node found.\n"; @@ -6773,8 +6786,8 @@ // same main/alternate vector ops, just do different shuffling. } else if (Instruction::isBinaryOp(E->getOpcode())) { VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); - VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, - CostKind); + VecCost += + TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); } else if (auto *CI0 = dyn_cast(VL0)) { VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), @@ -6793,9 +6806,9 @@ VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, TTI::CastContextHint::None, CostKind); } - + SmallVector Mask; if (E->ReuseShuffleIndices.empty()) { - CommonCost = + VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy); } else { SmallVector Mask; @@ -6806,14 +6819,15 @@ return I->getOpcode() == E->getAltOpcode(); }, Mask); - CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - FinalVecTy, Mask); + VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + FinalVecTy, Mask); } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; - } - default: - llvm_unreachable("Unknown instruction"); + return VecCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + default: + llvm_unreachable("Unknown instruction"); } } @@ -7302,11 +7316,6 @@ if (isa(EU.Scalar->getType())) continue; - // Already counted the cost for external uses when tried to adjust the cost - // for extractelements, no need to add it again. - if (isa(EU.Scalar)) - continue; - // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. if (auto *VU = dyn_cast_or_null(EU.User)) { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -slp-threshold=-5 -S -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-threshold=-3 -S -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll @@ -107,25 +107,14 @@ define void @select_ule_ugt_mix_4xi32(i32* %ptr, i32 %x) { ; CHECK-LABEL: @select_ule_ugt_mix_4xi32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[L_0:%.*]] = load i32, i32* [[PTR:%.*]], align 4 -; CHECK-NEXT: [[CMP_0:%.*]] = icmp ult i32 [[L_0]], 16383 -; CHECK-NEXT: [[S_0:%.*]] = select i1 [[CMP_0]], i32 [[L_0]], i32 16383 -; CHECK-NEXT: store i32 [[S_0]], i32* [[PTR]], align 4 -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 1 -; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[GEP_1]], align 4 -; CHECK-NEXT: [[CMP_1:%.*]] = icmp ugt i32 [[L_1]], 16383 -; CHECK-NEXT: [[S_1:%.*]] = select i1 [[CMP_1]], i32 [[L_1]], i32 16383 -; CHECK-NEXT: store i32 [[S_1]], i32* [[GEP_1]], align 4 -; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 2 -; CHECK-NEXT: [[L_2:%.*]] = load i32, i32* [[GEP_2]], align 4 -; CHECK-NEXT: [[CMP_2:%.*]] = icmp ult i32 [[L_2]], 16383 -; CHECK-NEXT: [[S_2:%.*]] = select i1 [[CMP_2]], i32 [[L_2]], i32 16383 -; CHECK-NEXT: store i32 [[S_2]], i32* [[GEP_2]], align 4 -; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 3 -; CHECK-NEXT: [[L_3:%.*]] = load i32, i32* [[GEP_3]], align 4 -; CHECK-NEXT: [[CMP_3:%.*]] = icmp ugt i32 [[L_3]], 16383 -; CHECK-NEXT: [[S_3:%.*]] = select i1 [[CMP_3]], i32 [[L_3]], i32 16383 -; CHECK-NEXT: store i32 [[S_3]], i32* [[GEP_3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[PTR:%.*]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[PTR]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -53,13 +53,17 @@ define void @i64_simplifiedi_extract(i64* noalias %st, i64* noalias %ld) { ; CHECK-LABEL: @i64_simplifiedi_extract( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST:%.*]] to <4 x i64>* -; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3 -; CHECK-NEXT: store i64 [[TMP4]], i64* [[LD]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 +; CHECK-NEXT: [[T0:%.*]] = load i64, i64* [[LD]], align 8 +; CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 +; CHECK-NEXT: store i64 [[T0]], i64* [[ST]], align 8 +; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX3]], align 8 +; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX4]], align 8 +; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX5]], align 8 +; CHECK-NEXT: store i64 [[T1]], i64* [[LD]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll @@ -8,13 +8,22 @@ ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ poison, [[BB2:%.*]] ], [ zeroinitializer, [[BB1:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[SHUFFLE]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[OP_RDX]] to i64 +; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ 0, [[BB2:%.*]] ], [ 0, [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ 0, [[BB2]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP44]], i32 [[TMP4]], i32 5 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 6 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP4]], i32 7 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP7]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP8]], [[TMP4]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[OP_RDX]], [[OP_RDX1]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[OP_RDX2]], [[TMP]] +; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[OP_RDX3]] to i64 ; CHECK-NEXT: ret i64 [[TMP65]] ; bb1: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll @@ -6,14 +6,24 @@ define i16 @D134605() { ; CHECK-LABEL: @D134605( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr poison, align 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[SHUFFLE]], i32 6 -; CHECK-NEXT: [[REASS_ADD:%.*]] = add i16 poison, [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[SHUFFLE]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[TMP2]], poison -; CHECK-NEXT: [[OP_RDX1:%.*]] = add i16 [[OP_RDX]], poison -; CHECK-NEXT: [[REASS_MUL24:%.*]] = shl i16 [[OP_RDX1]], 2 +; CHECK-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 3 +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX81]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr poison, align 1 +; CHECK-NEXT: [[ARRAYIDX101:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX101]], align 1 +; CHECK-NEXT: [[ARRAYIDX107:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX107]], align 1 +; CHECK-NEXT: [[REASS_ADD:%.*]] = add i16 poison, [[TMP0]] +; CHECK-NEXT: [[ADD116:%.*]] = add i16 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[ADD122:%.*]] = add i16 [[ADD116]], [[TMP2]] +; CHECK-NEXT: [[ADD124:%.*]] = add i16 [[ADD122]], [[TMP3]] +; CHECK-NEXT: [[ADD125:%.*]] = add i16 [[ADD124]], poison +; CHECK-NEXT: [[FACTOR2531:%.*]] = add i16 [[TMP3]], [[ADD125]] +; CHECK-NEXT: [[ADD14332:%.*]] = add i16 [[FACTOR2531]], [[TMP2]] +; CHECK-NEXT: [[ADD14933:%.*]] = add i16 [[ADD14332]], [[TMP1]] +; CHECK-NEXT: [[ADD15534:%.*]] = add i16 [[ADD14933]], [[TMP0]] +; CHECK-NEXT: [[ADD15935:%.*]] = add i16 [[ADD15534]], poison +; CHECK-NEXT: [[REASS_MUL24:%.*]] = shl i16 [[ADD15935]], 2 ; CHECK-NEXT: [[CALL:%.*]] = call i16 @check_i16(i16 noundef 1, i16 noundef [[REASS_MUL24]], i16 noundef 5120) ; CHECK-NEXT: unreachable ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -4,20 +4,32 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <2 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) -; CHECK-NEXT: ret i32 [[TMP11]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[A1:%.*]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP0]], [[A2:%.*]] +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[TMP0]], [[A3:%.*]] +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[TMP0]], [[A4:%.*]] +; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[TMP0]], [[A5:%.*]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[TMP0]], [[A6:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARR]], align 4 +; CHECK-NEXT: [[ADD12:%.*]] = add i32 [[TMP1]], [[A7:%.*]] +; CHECK-NEXT: [[ADD14:%.*]] = add i32 [[TMP1]], [[A8:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]] +; CHECK-NEXT: [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]] +; CHECK-NEXT: [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]] +; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]] +; CHECK-NEXT: [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]] +; CHECK-NEXT: [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]] +; CHECK-NEXT: [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]] +; CHECK-NEXT: [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]] +; CHECK-NEXT: [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]] +; CHECK-NEXT: [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]] +; CHECK-NEXT: [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]] +; CHECK-NEXT: [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]] +; CHECK-NEXT: [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]] +; CHECK-NEXT: ret i32 [[COND44]] ; entry: %arrayidx = getelementptr inbounds i32, i32* %arr, i64 1