diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1079,6 +1079,11 @@
                                  VectorType *SubTp = nullptr,
                                  ArrayRef<const Value *> Args = None) const;
 
+  /// \returns The cost of an add-sub instruction. This corresponds to a
+  /// shuffle + fadd + fsub pattern in IR.
+  InstructionCost getAltInstrCost(VectorType *VecTy, ArrayRef<int> ShuffleMask,
+                                  ArrayRef<const Value *> Args) const;
+
   /// Represents a hint about the context in which a cast is used.
   ///
   /// For zext/sext, the context of the cast is the operand, which must be a
@@ -1700,6 +1705,9 @@
                                          ArrayRef<int> Mask, int Index,
                                          VectorType *SubTp,
                                          ArrayRef<const Value *> Args) = 0;
+  virtual InstructionCost
+  getAltInstrCost(VectorType *VecTy, ArrayRef<int> ShuffleMask,
+                  ArrayRef<const Value *> Args) const = 0;
   virtual InstructionCost
   getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, CastContextHint CCH,
                    TTI::TargetCostKind CostKind,
@@ -2237,6 +2245,11 @@
                                  ArrayRef<const Value *> Args) override {
     return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp, Args);
   }
+  InstructionCost
+  getAltInstrCost(VectorType *VecTy, ArrayRef<int> ShuffleMask,
+                  ArrayRef<const Value *> Args) const override {
+    return Impl.getAltInstrCost(VecTy, ShuffleMask, Args);
+  }
   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                    CastContextHint CCH,
                                    TTI::TargetCostKind CostKind,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -503,6 +503,11 @@
     return 1;
   }
 
+  InstructionCost getAltInstrCost(VectorType *VecTy, ArrayRef<int> ShuffleMask,
+                                  ArrayRef<const Value *> Args) const {
+    return 1;
+  }
+
   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                    TTI::CastContextHint CCH,
                                    TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -765,6 +765,15 @@
   return Cost;
 }
 
+InstructionCost
+TargetTransformInfo::getAltInstrCost(VectorType *VecTy,
+                                     ArrayRef<int> ShuffleMask,
+                                     ArrayRef<const Value *> Args) const {
+  InstructionCost Cost = TTIImpl->getAltInstrCost(VecTy, ShuffleMask, Args);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 TTI::CastContextHint
 TargetTransformInfo::getCastContextHint(const Instruction *I) {
   if (!I)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -138,6 +138,8 @@
                                  ArrayRef<int> Mask, int Index,
                                  VectorType *SubTp,
                                  ArrayRef<const Value *> Args = None);
+  InstructionCost getAltInstrCost(VectorType *VecTy, ArrayRef<int> ShuffleMask,
+                                  ArrayRef<const Value *> Args) const;
   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                    TTI::CastContextHint CCH,
                                    TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1638,6 +1638,34 @@
   return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
 }
 
+InstructionCost
+X86TTIImpl::getAltInstrCost(VectorType *VecTy, ArrayRef<int> ShuffleMask,
+                            ArrayRef<const Value *> Args) const {
+  assert(cast<FixedVectorType>(VecTy)->getNumElements() ==
+             ShuffleMask.size() &&
+         "Expected same number of lanes");
+  assert(Args.size() == 2 && isa<Instruction>(Args[0]) &&
+         isa<Instruction>(Args[1]) && "Not an AddSub instruction");
+#ifndef NDEBUG
+  // For the add-sub pattern, the opcodes of the two operands and the shuffle
+  // mask must form a pattern that isLegalAltInstr() accepts:
+  unsigned Opcode0 = cast<Instruction>(Args[0])->getOpcode();
+  unsigned Opcode1 = cast<Instruction>(Args[1])->getOpcode();
+  SmallBitVector OpcodeMask(ShuffleMask.size(), false);
+  unsigned NumLanes = ShuffleMask.size();
+  for (unsigned Lane : seq<unsigned>(0, NumLanes))
+    if (ShuffleMask[Lane] >= (int)NumLanes)
+      OpcodeMask.set(Lane);
+  assert(isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) &&
+         "We can only accept patterns matched by isLegalAltInstr()");
+#endif
+  // AVX2/AVX imply SSE3, so query the wider feature sets first; otherwise
+  // the SSE3 check would shadow them.
+  if (ST->hasAVX2())
+    return 1; // 8xf32 vaddsubps, 4xf64 vaddsubpd
+  if (ST->hasAVX())
+    return 1; // 4xf32 vaddsubps, 2xf64 vaddsubpd
+  if (ST->hasSSE3())
+    return 2; // 2xf32 addsubps, 2xf64 addsubpd
+  llvm_unreachable("Should have been rejected by isLegalAltInstr()");
+}
+
 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                              Type *Src,
                                              TTI::CastContextHint CCH,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -939,6 +939,10 @@
   /// insertelement nodes, otherwise skip them.
   Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
 
+  /// \returns true if \p TE is an alt-shuffle that can be lowered to a single
+  /// instruction. An example of this is the X86 addsub instruction.
+  bool isAltShuffleThatLowersToOneInstr(const TreeEntry *TE) const;
+
   /// Reorders the current graph to the most profitable order starting from the
   /// root node to the leaf nodes. The best order is chosen only from the nodes
   /// of the same size (vectorization factor). Smaller nodes are considered
@@ -3601,6 +3605,22 @@
   return None;
 }
 
+bool BoUpSLP::isAltShuffleThatLowersToOneInstr(const TreeEntry *TE) const {
+  if (!TE->isAltShuffle())
+    return false;
+  VectorType *VecTy =
+      FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
+  unsigned Opcode0 = TE->getOpcode();
+  unsigned Opcode1 = TE->getAltOpcode();
+  // The opcode mask selects between the two opcodes.
+  SmallBitVector OpcodeMask(TE->Scalars.size(), false);
+  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
+    if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
+      OpcodeMask.set(Lane);
+  // The pattern lowers to one instruction only if the target supports it.
+  return TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask);
+}
+
 void BoUpSLP::reorderTopToBottom() {
   // Maps VF to the graph nodes.
   DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -3633,21 +3653,9 @@
     // Patterns like [fadd,fsub] can be combined into a single instruction in
     // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
     // to take into account their order when looking for the most used order.
-    if (TE->isAltShuffle()) {
-      VectorType *VecTy =
-          FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
-      unsigned Opcode0 = TE->getOpcode();
-      unsigned Opcode1 = TE->getAltOpcode();
-      // The opcode mask selects between the two opcodes.
-      SmallBitVector OpcodeMask(TE->Scalars.size(), 0);
-      for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
-        if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
-          OpcodeMask.set(Lane);
-      // If this pattern is supported by the target then we consider the order.
-      if (TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
-        VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
-      }
+    if (isAltShuffleThatLowersToOneInstr(TE.get())) {
+      VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+      AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
       // TODO: Check the reverse order too.
     }
@@ -6267,8 +6275,10 @@
         ScalarCost += TTI->getInstructionCost(I, CostKind);
       }
       // VecCost is equal to sum of the cost of creating 2 vectors
-      // and the cost of creating shuffle.
+      // and the cost of creating the shuffle, except if this is an alternate
+      // sequence that can be lowered to a single instruction, like x86 addsub.
      InstructionCost VecCost = 0;
+      bool LowersToOneAltInstr = isAltShuffleThatLowersToOneInstr(E);
      // Try to find the previous shuffle node with the same operands and same
      // main/alternate ops.
      auto &&TryFindNodeWithEqualOperands = [this, E]() {
@@ -6293,9 +6303,12 @@
         // No need to add new vector costs here since we're going to reuse
         // same main/alternate vector ops, just do different shuffling.
       } else if (Instruction::isBinaryOp(E->getOpcode())) {
-        VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
-        VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
-                                               CostKind);
+        if (!LowersToOneAltInstr) {
+          VecCost =
+              TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+          VecCost +=
+              TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
+        }
       } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
         VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
                                           Builder.getInt1Ty(),
@@ -6315,10 +6328,7 @@
                                   TTI::CastContextHint::None, CostKind);
       }
-      if (E->ReuseShuffleIndices.empty()) {
-        CommonCost =
-            TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
-      } else {
+      if (LowersToOneAltInstr) {
         SmallVector<int> Mask;
         buildShuffleEntryMask(
             E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
             [E](Instruction *I) {
               assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
               return I->getOpcode() == E->getAltOpcode();
             },
             Mask);
@@ -6327,8 +6337,25 @@
-        CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
-                                         FinalVecTy, Mask);
+        const Value *Args[] = {E->getMainOp(), E->getAltOp()};
+        CommonCost = TTI->getAltInstrCost(FinalVecTy, Mask, Args);
+      } else {
+        if (E->ReuseShuffleIndices.empty()) {
+          CommonCost =
+              TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
+        } else {
+          SmallVector<int> Mask;
+          buildShuffleEntryMask(
+              E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+              [E](Instruction *I) {
+                assert(E->isOpcodeOrAlt(I) &&
+                       "Unexpected main/alternate opcode");
+                return I->getOpcode() == E->getAltOpcode();
+              },
+              Mask);
+          CommonCost = TTI->getShuffleCost(
+              TargetTransformInfo::SK_PermuteTwoSrc, FinalVecTy, Mask);
+        }
       }
       LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
       return CommonCost + VecCost - ScalarCost;
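
Note on the pattern being costed (editor's addendum, not part of the patch):
the new hook charges for the shuffle + fadd + fsub idiom that SLP builds for
alternating add/sub lanes. A minimal hand-written LLVM IR sketch follows; the
function name and vector shapes are illustrative, not taken from the patch or
its tests:

  define <4 x float> @addsub(<4 x float> %a, <4 x float> %b) {
    %add = fadd <4 x float> %a, %b
    %sub = fsub <4 x float> %a, %b
    ; Mask entries < 4 select fsub lanes, entries >= 4 select fadd lanes;
    ; the "ShuffleMask[Lane] >= NumLanes" test in X86TTIImpl::getAltInstrCost
    ; derives OpcodeMask from exactly these entries.
    %r = shufflevector <4 x float> %sub, <4 x float> %add,
                       <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    ret <4 x float> %r
  }

On SSE3-and-later subtargets this whole sequence selects to a single
addsubps/vaddsubps, which is why the patch skips the two
getArithmeticInstrCost() calls and charges only getAltInstrCost() for such
tree entries.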