diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4051,8 +4051,9 @@ for (Value *V : VL) CommonAlignment = commonAlignment(CommonAlignment, cast(V)->getAlign()); - if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), - CommonAlignment)) + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) && + !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment)) return LoadsState::ScatterVectorize; } @@ -5563,70 +5564,96 @@ Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); + auto &&GetCostDiff = + [this, E, VL, CommonCost, + VL0](function_ref ScalarEltCost, + function_ref VectorCost) { + // Calculate the cost of this instruction. + InstructionCost ScalarCost = 0; + if (isa(VL0)) { + // For some of the instructions no need to calculate cost for each + // particular instruction, we can use the cost of the single + // instruction x total number of scalar instructions. + ScalarCost = VL.size() * + ScalarEltCost(std::distance(VL.begin(), find(VL, VL0))); + } else { + for (unsigned I = 0, Sz = VL.size(); I < Sz; ++I) + ScalarCost += ScalarEltCost(I); + } + + InstructionCost VecCost = VectorCost(CommonCost); + LLVM_DEBUG( + dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost)); + // Silence unused warnings for `this` and `E`, which are required only + // for `dumpTreeCosts`. + (void)this; + (void)E; + return VecCost - ScalarCost; + }; switch (ShuffleOrOp) { - case Instruction::PHI: - return 0; + case Instruction::PHI: { + // Count reused scalars. 
+ InstructionCost ScalarCost = 0; + SmallPtrSet CountedOps; + for (Value *V : VL) { + auto *PHI = dyn_cast(V); + if (!PHI) + continue; - case Instruction::ExtractValue: - case Instruction::ExtractElement: { - // The common cost of removal ExtractElement/ExtractValue instructions + - // the cost of shuffles, if required to resuffle the original vector. - if (NeedToShuffleReuses) { - unsigned Idx = 0; - for (unsigned I : E->ReuseShuffleIndices) { - if (ShuffleOrOp == Instruction::ExtractElement) { - auto *EE = cast(VL[I]); - CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, - EE->getVectorOperandType(), - *getExtractIndex(EE)); - } else { - CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); - ++Idx; - } - } - Idx = EntryVF; - for (Value *V : VL) { - if (ShuffleOrOp == Instruction::ExtractElement) { - auto *EE = cast(V); - CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, - EE->getVectorOperandType(), - *getExtractIndex(EE)); - } else { - --Idx; - CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); - } - } - } - if (ShuffleOrOp == Instruction::ExtractValue) { - for (unsigned I = 0, E = VL.size(); I < E; ++I) { - auto *EI = cast(VL[I]); - // Take credit for instruction that will become dead. - if (EI->hasOneUse()) { - Instruction *Ext = EI->user_back(); - if ((isa(Ext) || isa(Ext)) && - all_of(Ext->users(), - [](User *U) { return isa(U); })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - CommonCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, I); - // Add back the cost of s|zext which is subtracted separately. 
- CommonCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EI->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); - continue; - } - } - CommonCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); - } - } else { - AdjustExtractsCost(CommonCost); + ValueList Operands(PHI->getNumIncomingValues(), nullptr); + for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) { + Value *Op = PHI->getIncomingValue(I); + Operands[I] = Op; } - return CommonCost; + if (const TreeEntry *OpTE = getTreeEntry(Operands.front())) + if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second) + if (!OpTE->ReuseShuffleIndices.empty()) + ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() - + OpTE->Scalars.size()); } + + return CommonCost - ScalarCost; + } + case Instruction::ExtractValue: + case Instruction::ExtractElement: + return GetCostDiff( + [this, ShuffleOrOp, VL, ScalarTy, CostKind](unsigned Idx) { + auto *I = cast(VL[Idx]); + VectorType *SrcVecTy; + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *EE = cast(I); + SrcVecTy = EE->getVectorOperandType(); + } else { + auto *EV = cast(I); + Type *AggregateTy = EV->getAggregateOperand()->getType(); + unsigned NumElts; + if (auto *ATy = dyn_cast(AggregateTy)) + NumElts = ATy->getNumElements(); + else + NumElts = AggregateTy->getStructNumElements(); + SrcVecTy = FixedVectorType::get(ScalarTy, NumElts); + } + if (I->hasOneUse()) { + Instruction *Ext = I->user_back(); + if ((isa(Ext) || isa(Ext)) && + all_of(Ext->users(), + [](User *U) { return isa(U); })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + InstructionCost Cost = TTI->getExtractWithExtendCost( + Ext->getOpcode(), Ext->getType(), SrcVecTy, + *getExtractIndex(I)); + // Subtract the cost of s|zext which is subtracted separately. 
+ Cost -= TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), I->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + return Cost; + } + } + return TTI->getVectorInstrCost(Instruction::ExtractElement, + SrcVecTy, *getExtractIndex(I)); + }, + [](InstructionCost CommonCost) { return CommonCost; }); case Instruction::InsertElement: { assert(E->ReuseShuffleIndices.empty() && "Unique insertelements only are expected."); @@ -5699,79 +5726,76 @@ case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: - case Instruction::BitCast: { - Type *SrcTy = VL0->getOperand(0)->getType(); - InstructionCost ScalarEltCost = - TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, - TTI::getCastContextHint(VL0), CostKind, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - - // Calculate the cost of this instruction. - InstructionCost ScalarCost = VL.size() * ScalarEltCost; - - auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); - InstructionCost VecCost = 0; - // Check if the values are candidates to demote. - if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { - VecCost = CommonCost + TTI->getCastInstrCost( - E->getOpcode(), VecTy, SrcVecTy, - TTI::getCastContextHint(VL0), CostKind, VL0); - } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return VecCost - ScalarCost; - } + case Instruction::BitCast: + return GetCostDiff( + [this, E, ScalarTy, VL, CostKind](unsigned Idx) { + auto *VI = cast(VL[Idx]); + return TTI->getCastInstrCost( + E->getOpcode(), ScalarTy, VI->getOperand(0)->getType(), + TTI::getCastContextHint(VI), CostKind, VI); + }, + [this, VL, VL0, VecTy, E, CostKind](InstructionCost CommonCost) { + Type *SrcTy = VL0->getOperand(0)->getType(); + auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); + InstructionCost VecCost = 0; + // Check if the values are candidates to demote. 
+ if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { + VecCost = CommonCost + + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, + TTI::getCastContextHint(VL0), + CostKind, VL0); + } + return VecCost; + }); case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: { - // Calculate the cost of this instruction. - InstructionCost ScalarEltCost = - TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), - CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - - // Check if all entries in VL are either compares or selects with compares - // as condition that have the same predicates. - CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE; - bool First = true; - for (auto *V : VL) { - CmpInst::Predicate CurrentPred; - auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); - if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) && - !match(V, MatchCmp)) || - (!First && VecPred != CurrentPred)) { - VecPred = CmpInst::BAD_ICMP_PREDICATE; - break; - } - First = false; - VecPred = CurrentPred; - } - - InstructionCost VecCost = TTI->getCmpSelInstrCost( - E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); - // Check if it is possible and profitable to use min/max for selects in - // VL. - // - auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); - if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { - IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, - {VecTy, VecTy}); - InstructionCost IntrinsicCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - // If the selects are the only uses of the compares, they will be dead - // and we can adjust the cost by removing their cost. 
- if (IntrinsicAndUse.second) - IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, - MaskTy, VecPred, CostKind); - VecCost = std::min(VecCost, IntrinsicCost); - } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; + CmpInst::Predicate VecPred, SwappedVecPred; + auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value()); + if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) || + match(VL0, MatchCmp)) + SwappedVecPred = CmpInst::getSwappedPredicate(VecPred); + else + SwappedVecPred = VecPred = CmpInst::BAD_ICMP_PREDICATE; + return GetCostDiff( + [this, E, ScalarTy, VL, CostKind, &VecPred, + &SwappedVecPred](unsigned Idx) { + auto *VI = cast(VL[Idx]); + CmpInst::Predicate CurrentPred; + auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); + if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) && + !match(VI, MatchCmp)) || + (CurrentPred != VecPred && CurrentPred != SwappedVecPred)) + VecPred = SwappedVecPred = CmpInst::BAD_ICMP_PREDICATE; + + return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, + Builder.getInt1Ty(), CurrentPred, + CostKind, VI); + }, + [this, VL, VL0, VecTy, E, CostKind, + &VecPred](InstructionCost CommonCost) { + auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); + + InstructionCost VecCost = TTI->getCmpSelInstrCost( + E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); + // Check if it is possible and profitable to use min/max for selects + // in VL. + // + auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); + if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { + IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, + {VecTy, VecTy}); + InstructionCost IntrinsicCost = + TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + // If the selects are the only uses of the compares, they will be + // dead and we can adjust the cost by removing their cost. 
+ if (IntrinsicAndUse.second) + IntrinsicCost -= TTI->getCmpSelInstrCost( + Instruction::ICmp, VecTy, MaskTy, VecPred, CostKind); + VecCost = std::min(VecCost, IntrinsicCost); + } + return VecCost + CommonCost; + }); } case Instruction::FNeg: case Instruction::Add: @@ -5791,137 +5815,125 @@ case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: { + case Instruction::Xor: + case Instruction::GetElementPtr: { + unsigned Opcode = ShuffleOrOp == Instruction::GetElementPtr + ? Instruction::Add + : ShuffleOrOp; // Certain instructions can be cheaper to vectorize if they have a // constant second vector operand. - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; - TargetTransformInfo::OperandValueProperties Op1VP = - TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = TargetTransformInfo::OP_PowerOf2; - - // If all operands are exactly the same ConstantInt then set the - // operand kind to OK_UniformConstantValue. - // If instead not all operands are constants, then set the operand kind - // to OK_AnyValue. If all operands are constants but not the same, - // then set the operand kind to OK_NonUniformConstantValue. ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast(VL[i]); - unsigned OpIdx = isa(I) ? 
1 : 0; - ConstantInt *CInt = dyn_cast(I->getOperand(OpIdx)); - if (!CInt) { - Op2VK = TargetTransformInfo::OK_AnyValue; - Op2VP = TargetTransformInfo::OP_None; - break; - } - if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; - } - if (CInt0 != CInt) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - } - - SmallVector Operands(VL0->operand_values()); - InstructionCost ScalarEltCost = - TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, - Op2VK, Op1VP, Op2VP, Operands, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecCost = - TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, - Op2VK, Op1VP, Op2VP, Operands, VL0); - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; - } - case Instruction::GetElementPtr: { - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; - - InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( - Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecCost = TTI->getArithmeticInstrCost( - Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; - } - case Instruction::Load: { - // Cost of wide load - cost of scalar loads. 
- Align Alignment = cast(VL0)->getAlign(); - InstructionCost ScalarEltCost = TTI->getMemoryOpCost( - Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecLdCost; - if (E->State == TreeEntry::Vectorize) { - VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0, - CostKind, VL0); - } else { - assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); - Align CommonAlignment = Alignment; - for (Value *V : VL) - CommonAlignment = - commonAlignment(CommonAlignment, cast(V)->getAlign()); - VecLdCost = TTI->getGatherScatterOpCost( - Instruction::Load, VecTy, cast(VL0)->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind, VL0); - } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost)); - return CommonCost + VecLdCost - ScalarLdCost; - } - case Instruction::Store: { - // We know that we can merge the stores. Calculate the cost. - bool IsReorder = !E->ReorderIndices.empty(); - auto *SI = - cast(IsReorder ? VL[E->ReorderIndices.front()] : VL0); - Align Alignment = SI->getAlign(); - InstructionCost ScalarEltCost = TTI->getMemoryOpCost( - Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); - InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecStCost = TTI->getMemoryOpCost( - Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost)); - return CommonCost + VecStCost - ScalarStCost; + return GetCostDiff( + [this, ScalarTy, VL, CostKind, &Op2VK, &Op2VP, &CInt0, + Opcode](unsigned Idx) { + auto *VI = cast(VL[Idx]); + unsigned OpIdx = isa(VI) ? 
1 : 0; + TargetTransformInfo::OperandValueKind ScalarOp2VK = + TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OperandValueProperties ScalarOp2VP = + TargetTransformInfo::OP_PowerOf2; + ConstantInt *CInt = dyn_cast(VI->getOperand(OpIdx)); + if (!CInt) { + ScalarOp2VK = Op2VK = TargetTransformInfo::OK_AnyValue; + ScalarOp2VP = Op2VP = TargetTransformInfo::OP_None; + } else { + if (!CInt->getValue().isPowerOf2()) + ScalarOp2VP = Op2VP = TargetTransformInfo::OP_None; + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue) { + if (!CInt0) + CInt0 = CInt; + else if (CInt0 != CInt) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + } + } + SmallVector Operands(VI->operand_values()); + return TTI->getArithmeticInstrCost( + Opcode, ScalarTy, CostKind, TargetTransformInfo::OK_AnyValue, + ScalarOp2VK, TargetTransformInfo::OP_None, ScalarOp2VP, + Operands, VI); + }, + [this, VecTy, CostKind, E, VL0, &Op2VK, &Op2VP, + Opcode](InstructionCost CommonCost) { + if (Op2VK == TargetTransformInfo::OK_AnyValue && + isSplat(E->getOperand( + isa(VL0) ? 
1 : 0))) + Op2VK = TargetTransformInfo::OK_UniformValue; + return TTI->getArithmeticInstrCost( + Opcode, VecTy, CostKind, + TargetTransformInfo::OK_AnyValue, Op2VK, + TargetTransformInfo::OP_None, Op2VP) + + CommonCost; + }); } + case Instruction::Load: + return GetCostDiff( + [this, ScalarTy, VL, CostKind](unsigned Idx) { + auto *VI = cast(VL[Idx]); + return TTI->getMemoryOpCost( + Instruction::Load, ScalarTy, VI->getAlign(), + VI->getPointerAddressSpace(), CostKind, VI); + }, + [this, VL, VL0, VecTy, E, CostKind](InstructionCost CommonCost) { + auto *LI0 = cast(VL0); + InstructionCost VecLdCost; + if (E->State == TreeEntry::Vectorize) { + VecLdCost = TTI->getMemoryOpCost( + Instruction::Load, VecTy, LI0->getAlign(), + LI0->getPointerAddressSpace(), CostKind); + } else { + assert(E->State == TreeEntry::ScatterVectorize && + "Unknown EntryState"); + Align CommonAlignment = LI0->getAlign(); + for (Value *V : VL) + CommonAlignment = commonAlignment( + CommonAlignment, cast(V)->getAlign()); + VecLdCost = TTI->getGatherScatterOpCost( + Instruction::Load, VecTy, LI0->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind); + } + return VecLdCost + CommonCost; + }); + case Instruction::Store: + return GetCostDiff( + [this, ScalarTy, VL, CostKind](unsigned Idx) { + auto *VI = cast(VL[Idx]); + return TTI->getMemoryOpCost( + Instruction::Store, ScalarTy, VI->getAlign(), + VI->getPointerAddressSpace(), CostKind, VI); + }, + [this, VL, VL0, VecTy, E, CostKind](InstructionCost CommonCost) { + // We know that we can merge the stores. Calculate the cost. + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = cast(IsReorder ? 
VL[E->ReorderIndices.front()] + : VL0); + return TTI->getMemoryOpCost( + Instruction::Store, VecTy, SI->getAlign(), + SI->getPointerAddressSpace(), CostKind) + + CommonCost; + }); case Instruction::Call: { - CallInst *CI = cast(VL0); - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - - // Calculate the cost of the scalar and vector calls. - IntrinsicCostAttributes CostAttrs(ID, *CI, 1); - InstructionCost ScalarEltCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; - - auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); - InstructionCost VecCallCost = - std::min(VecCallCosts.first, VecCallCosts.second); - - LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost - << " (" << VecCallCost << "-" << ScalarCallCost << ")" - << " for " << *CI << "\n"); - - return CommonCost + VecCallCost - ScalarCallCost; + return GetCostDiff( + [this, VL, CostKind](unsigned Idx) { + auto *CI = cast(VL[Idx]); + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + if (ID != Intrinsic::not_intrinsic) { + IntrinsicCostAttributes CostAttrs(ID, *CI, 1); + return TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + } + return TTI->getCallInstrCost( + CI->getCalledFunction(), CI->getFunctionType()->getReturnType(), + CI->getFunctionType()->params(), CostKind); + }, + [this, VL0, VecTy](InstructionCost CommonCost) { + auto *CI = cast(VL0); + auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); + return std::min(VecCallCosts.first, VecCallCosts.second) + + CommonCost; + }); } case Instruction::ShuffleVector: { assert(E->isAltShuffle() && @@ -5931,25 +5943,6 @@ Instruction::isCast(E->getAltOpcode())) || (isa(VL0) && isa(E->getAltOp()))) && "Invalid Shuffle Vector Operand"); - InstructionCost ScalarCost = 0; - if (NeedToShuffleReuses) { - for (unsigned Idx : E->ReuseShuffleIndices) { - 
Instruction *I = cast(VL[Idx]); - CommonCost -= TTI->getInstructionCost(I, CostKind); - } - for (Value *V : VL) { - Instruction *I = cast(V); - CommonCost += TTI->getInstructionCost(I, CostKind); - } - } - for (Value *V : VL) { - Instruction *I = cast(V); - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - ScalarCost += TTI->getInstructionCost(I, CostKind); - } - // VecCost is equal to sum of the cost of creating 2 vectors - // and the cost of creating shuffle. - InstructionCost VecCost = 0; // Try to find the previous shuffle node with the same operands and same // main/alternate ops. auto &&TryFindNodeWithEqualOperands = [this, E]() { @@ -5966,53 +5959,71 @@ } return false; }; - if (TryFindNodeWithEqualOperands()) { - LLVM_DEBUG({ - dbgs() << "SLP: diamond match for alternate node found.\n"; - E->dump(); - }); - // No need to add new vector costs here since we're going to reuse - // same main/alternate vector ops, just do different shuffling. - } else if (Instruction::isBinaryOp(E->getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); - VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, - CostKind); - } else if (auto *CI0 = dyn_cast(VL0)) { - VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, - Builder.getInt1Ty(), - CI0->getPredicate(), CostKind, VL0); - VecCost += TTI->getCmpSelInstrCost( - E->getOpcode(), ScalarTy, Builder.getInt1Ty(), - cast(E->getAltOp())->getPredicate(), CostKind, - E->getAltOp()); - } else { - Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); - Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); - auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size()); - auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty, + // Need to clear CommonCost since the final shuffle cost is included in the + // vector cost. 
+ return GetCostDiff( + [this, VL, CostKind, E](unsigned Idx) { + auto *VI = cast(VL[Idx]); + assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode"); + (void)E; + return TTI->getInstructionCost(VI, CostKind); + }, + [this, VecTy, E, CostKind, VL, FinalVecTy, + &TryFindNodeWithEqualOperands, VL0, ScalarTy](InstructionCost) { + // VecCost is equal to sum of the cost of creating 2 vectors + // and the cost of creating shuffle. + InstructionCost VecCost = 0; + if (TryFindNodeWithEqualOperands()) { + LLVM_DEBUG({ + dbgs() << "SLP: diamond match for alternate node found.\n"; + E->dump(); + }); + // No need to add new vector costs here since we're going to reuse + // same main/alternate vector ops, just do different shuffling. + } else if (Instruction::isBinaryOp(E->getOpcode())) { + VecCost = + TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); + VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, + CostKind); + } else if (auto *CI0 = dyn_cast(VL0)) { + VecCost = TTI->getCmpSelInstrCost( + E->getOpcode(), ScalarTy, Builder.getInt1Ty(), + CI0->getPredicate(), CostKind, VL0); + VecCost += TTI->getCmpSelInstrCost( + E->getOpcode(), ScalarTy, Builder.getInt1Ty(), + cast(E->getAltOp())->getPredicate(), CostKind, + E->getAltOp()); + } else { + Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); + Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); + auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size()); + auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size()); + VecCost = + TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty, TTI::CastContextHint::None, CostKind); - VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, - TTI::CastContextHint::None, CostKind); - } - - if (E->ReuseShuffleIndices.empty()) { - CommonCost = - TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy); - } else { - SmallVector Mask; - buildShuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - 
[E](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return I->getOpcode() == E->getAltOpcode(); - }, - Mask); - CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - FinalVecTy, Mask); - } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; + VecCost += + TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, + TTI::CastContextHint::None, CostKind); + } + SmallVector Mask; + if (E->ReuseShuffleIndices.empty()) { + VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, + FinalVecTy); + } else { + SmallVector Mask; + buildShuffleEntryMask( + E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && + "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask); + VecCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteTwoSrc, FinalVecTy, Mask); + } + return VecCost; + }); } default: llvm_unreachable("Unknown instruction"); @@ -6445,11 +6456,6 @@ if (isa(EU.Scalar->getType())) continue; - // Already counted the cost for external uses when tried to adjust the cost - // for extractelements, no need to add it again. - if (isa(EU.Scalar)) - continue; - // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. 
if (auto *VU = dyn_cast_or_null(EU.User)) { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -slp-threshold=-5 -S -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -slp-vectorizer -slp-threshold=-3 -S -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll @@ -107,25 +107,22 @@ define void @select_ule_ugt_mix_4xi32(i32* %ptr, i32 %x) { ; CHECK-LABEL: @select_ule_ugt_mix_4xi32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[L_0:%.*]] = load i32, i32* [[PTR:%.*]], align 4 -; CHECK-NEXT: [[CMP_0:%.*]] = icmp ult i32 [[L_0]], 16383 -; CHECK-NEXT: [[S_0:%.*]] = select i1 [[CMP_0]], i32 [[L_0]], i32 16383 -; CHECK-NEXT: store i32 [[S_0]], i32* [[PTR]], align 4 -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 1 -; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[GEP_1]], align 4 -; CHECK-NEXT: [[CMP_1:%.*]] = icmp ugt i32 [[L_1]], 16383 -; CHECK-NEXT: [[S_1:%.*]] = select i1 [[CMP_1]], i32 [[L_1]], i32 16383 -; CHECK-NEXT: store i32 [[S_1]], i32* [[GEP_1]], align 4 -; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 2 -; CHECK-NEXT: [[L_2:%.*]] = load i32, i32* [[GEP_2]], align 4 -; CHECK-NEXT: [[CMP_2:%.*]] = icmp ult i32 [[L_2]], 16383 -; CHECK-NEXT: [[S_2:%.*]] = select i1 [[CMP_2]], i32 [[L_2]], i32 16383 -; CHECK-NEXT: store i32 [[S_2]], i32* 
[[GEP_2]], align 4 -; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 3 -; CHECK-NEXT: [[L_3:%.*]] = load i32, i32* [[GEP_3]], align 4 -; CHECK-NEXT: [[CMP_3:%.*]] = icmp ugt i32 [[L_3]], 16383 -; CHECK-NEXT: [[S_3:%.*]] = select i1 [[CMP_3]], i32 [[L_3]], i32 16383 -; CHECK-NEXT: store i32 [[S_3]], i32* [[GEP_3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[PTR:%.*]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> , i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> , i32 [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <4 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <4 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[PTR]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -53,13 +53,17 @@ define void @i64_simplifiedi_extract(i64* noalias %st, i64* noalias %ld) { ; CHECK-LABEL: @i64_simplifiedi_extract( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* 
[[LD:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST:%.*]] to <4 x i64>* -; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3 -; CHECK-NEXT: store i64 [[TMP4]], i64* [[LD]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 +; CHECK-NEXT: [[T0:%.*]] = load i64, i64* [[LD]], align 8 +; CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 +; CHECK-NEXT: store i64 [[T0]], i64* [[ST]], align 8 +; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX3]], align 8 +; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX4]], align 8 +; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX5]], align 8 +; CHECK-NEXT: store i64 [[T1]], i64* [[LD]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -33,8 +33,11 @@ ; AVX-NEXT: ret void ; ; AVX512-LABEL: @foo( -; AVX512-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> , i32 8, <2 x i1> , <2 x i32> undef) -; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16 +; AVX512-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr 
inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 +; AVX512-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 +; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16 ; AVX512-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -104,24 +104,20 @@ ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512F-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 -; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX512F-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX512F-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 -; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX512F-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, 
i32* [[TMP1]], i64 5 -; AVX512F-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; AVX512F-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( @@ -258,44 +254,36 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512F-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; 
AVX512F-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX512F-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; AVX512F-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX512F-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX512F-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; AVX512F-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX512F-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512F-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512F-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; AVX512F-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512F-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512F-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; AVX512F-NEXT: [[TMP29:%.*]] = getelementptr 
inbounds i32, i32* [[TMP0]], i64 7 -; AVX512F-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512F-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; AVX512F-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512F-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], +; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX512F-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX512F-NEXT: 
[[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0 +; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1 +; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2 +; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 +; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], +; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( @@ -469,44 +457,36 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX512F-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX512F-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 +; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX512F-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512F-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX512F-NEXT: 
[[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512F-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 +; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 +; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 +; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], +; AVX512F-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512F-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; AVX512F-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; AVX512F-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 -; AVX512F-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 -; AVX512F-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512F-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512F-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512F-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa 
[[TBAA0]] -; AVX512F-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 +; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1 +; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2 +; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3 +; AVX512F-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], +; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -104,24 +104,20 @@ ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512F-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 -; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX512F-NEXT: store i32 
[[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX512F-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 -; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX512F-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX512F-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; AVX512F-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( @@ -258,44 +254,36 @@ ; AVX2-NEXT: ret void ; ; 
AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512F-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX512F-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; AVX512F-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX512F-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX512F-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; AVX512F-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX512F-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512F-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512F-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, 
!tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; AVX512F-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512F-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512F-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; AVX512F-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512F-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512F-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; AVX512F-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512F-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], +; AVX512F-NEXT: 
[[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX512F-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX512F-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0 +; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1 +; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2 +; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 +; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], +; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( @@ -469,44 +457,36 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX512F-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX512F-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; 
AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 +; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX512F-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512F-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512F-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 +; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 +; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 +; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], +; AVX512F-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512F-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; AVX512F-NEXT: 
[[T12:%.*]] = add i32 [[T11]], 3 -; AVX512F-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 -; AVX512F-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 -; AVX512F-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512F-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512F-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512F-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 +; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1 +; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2 +; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3 +; AVX512F-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], +; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll @@ -8,21 +8,22 @@ ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ poison, [[BB2:%.*]] ], [ zeroinitializer, [[BB1:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 
x i32> [[TMP0]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP1]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP1]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP1]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP1]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP9]]) -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[SHUFFLE]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[OP_RDX]] to i64 +; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ 0, [[BB2:%.*]] ], [ 0, [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ 0, [[BB2]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP4]], i32 3 +; CHECK-NEXT: [[TTMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TTMP4]], i32 [[TMP4]], i32 5 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 6 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP4]], i32 7 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP7]]) +; 
CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP8]], [[TMP4]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], [[TMP4]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[OP_RDX1]], [[TMP4]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[OP_RDX2]], [[TMP]] +; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[OP_RDX3]] to i64 ; CHECK-NEXT: ret i64 [[TMP65]] ; bb1: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -4,20 +4,32 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <2 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) -; CHECK-NEXT: ret i32 [[TMP11]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* 
[[ARR:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[A1:%.*]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP0]], [[A2:%.*]] +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[TMP0]], [[A3:%.*]] +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[TMP0]], [[A4:%.*]] +; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[TMP0]], [[A5:%.*]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[TMP0]], [[A6:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARR]], align 4 +; CHECK-NEXT: [[ADD12:%.*]] = add i32 [[TMP1]], [[A7:%.*]] +; CHECK-NEXT: [[ADD14:%.*]] = add i32 [[TMP1]], [[A8:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]] +; CHECK-NEXT: [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]] +; CHECK-NEXT: [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]] +; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]] +; CHECK-NEXT: [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]] +; CHECK-NEXT: [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]] +; CHECK-NEXT: [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]] +; CHECK-NEXT: [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]] +; CHECK-NEXT: [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]] +; CHECK-NEXT: [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]] +; CHECK-NEXT: [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]] +; CHECK-NEXT: [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]] +; CHECK-NEXT: [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]] +; CHECK-NEXT: ret i32 [[COND44]] ; entry: %arrayidx = getelementptr inbounds i32, i32* %arr, i64 1