diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1077,6 +1077,11 @@
                                  VectorType *SubTp = nullptr,
                                  ArrayRef<const Value *> Args = None) const;
 
+  /// \returns the cost of an add-sub instruction. This corresponds to a
+  /// shuffle + fadd + fsub pattern in IR.
+  InstructionCost getAltInstrCost(VectorType *VecTy, ArrayRef<int> ShuffleMask,
+                                  ArrayRef<Value *> Args) const;
+
   /// Represents a hint about the context in which a cast is used.
   ///
   /// For zext/sext, the context of the cast is the operand, which must be a
@@ -1698,6 +1703,9 @@
                                          ArrayRef<int> Mask, int Index,
                                          VectorType *SubTp,
                                          ArrayRef<const Value *> Args) = 0;
+  virtual InstructionCost
+  getAltInstrCost(VectorType *VecTy, ArrayRef<int> ShuffleMask,
+                  ArrayRef<Value *> Args) const = 0;
   virtual InstructionCost
   getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, CastContextHint CCH,
                    TTI::TargetCostKind CostKind,
@@ -2236,6 +2244,10 @@
                                  ArrayRef<const Value *> Args) override {
     return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp, Args);
   }
+  InstructionCost getAltInstrCost(VectorType *VecTy, ArrayRef<int> ShuffleMask,
+                                  ArrayRef<Value *> Args) const override {
+    return Impl.getAltInstrCost(VecTy, ShuffleMask, Args);
+  }
   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                    CastContextHint CCH,
                                    TTI::TargetCostKind CostKind,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -503,6 +503,11 @@
     return 1;
   }
 
+  InstructionCost getAltInstrCost(VectorType *VecTy, ArrayRef<int> ShuffleMask,
+                                  ArrayRef<Value *> Args) const {
+    return 1;
+  }
+
   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                    TTI::CastContextHint CCH,
                                    TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -766,6 +766,15 @@
   return Cost;
 }
 
+InstructionCost
+TargetTransformInfo::getAltInstrCost(VectorType *VecTy,
+                                     ArrayRef<int> ShuffleMask,
+                                     ArrayRef<Value *> Args) const {
+  InstructionCost Cost = TTIImpl->getAltInstrCost(VecTy, ShuffleMask, Args);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 TTI::CastContextHint
 TargetTransformInfo::getCastContextHint(const Instruction *I) {
   if (!I)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -138,6 +138,8 @@
                                  ArrayRef<int> Mask, int Index,
                                  VectorType *SubTp,
                                  ArrayRef<const Value *> Args = None);
+  InstructionCost getAltInstrCost(VectorType *VecTy, ArrayRef<int> ShuffleMask,
+                                  ArrayRef<Value *> Args) const;
   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                    TTI::CastContextHint CCH,
                                    TTI::TargetCostKind CostKind,
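Not part of the patch: the hunks above only thread the new hook through the TTI dispatch layers (public wrapper, pure-virtual Concept method, Model forwarder, and a default implementation that returns 1), so from a client's point of view it is an ordinary cost query. The following minimal sketch shows how a caller could use it; the helper name getAddSubCost, the fixed <4 x float> type, and the {0, 5, 2, 7} mask are illustrative assumptions, with the mask using the same even/odd convention as the SLP changes further down.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DerivedTypes.h"

    using namespace llvm;

    // Add0/Sub1 must be the scalar BinaryOperators of a pattern already
    // accepted by isLegalAltInstr(); the X86 implementation below asserts
    // both conditions.
    static InstructionCost getAddSubCost(const TargetTransformInfo &TTI,
                                         LLVMContext &Ctx, Value *Add0,
                                         Value *Sub1) {
      // <4 x float>, the type a single 128-bit (v)addsubps operates on.
      auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
      // Even lanes pick from the first source, odd lanes from the second.
      SmallVector<int> Mask = {0, 5, 2, 7};
      Value *Args[] = {Add0, Sub1};
      return TTI.getAltInstrCost(VecTy, Mask, Args);
    }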
instruction"); + // For the Add-sub pattern we need the operands of the operands to be same: + unsigned Opcode1 = cast(Args[0])->getOpcode(); + unsigned Opcode2 = cast(Args[1])->getOpcode(); + assert(isLegalAltInstr(VecTy, Opcode1, Opcode2, ShuffleMask) && + "We can only accept patterns matched by isLegalAltInstr()"); + if (ST->hasSSE3()) + return 2; // 2xf32 addsubps, 2xf64 addsubpd + if (ST->hasAVX()) + return 1; // 4xf32 vaddsubps, 2xf64 vaddsubpd + if (ST->hasAVX2()) + return 1; // 8xf32 vaddsubps, 4xf64 vaddsubpd + llvm_unreachable("Should have been rejected by isLegalAltInstr()"); +} + InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -939,6 +939,10 @@ /// insertelement nodes, otherwise skip them. Optional getReorderingData(const TreeEntry &TE, bool TopToBottom); + /// \Returns true if \p TE is an alt-shuffle that can be lowered to a single + /// instruction. An example of this is the X86 addsub instruction. + bool isAltShuffleThatLowersToOneInstr(const TreeEntry *TE) const; + /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes /// of the same size (vectorization factor). Smaller nodes are considered @@ -3617,6 +3621,24 @@ return None; } +bool BoUpSLP::isAltShuffleThatLowersToOneInstr(const TreeEntry *TE) const { + if (!TE->isAltShuffleWithRepeatingEvenOddOpcodes()) + return false; + VectorType *VecTy = + FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size()); + unsigned EvenOpcode = TE->getOpcode(); + unsigned OddOpcode = TE->getAltOpcode(); + // We have already checked that it is a repeating EvenOpcode/OddOpcode + // pattern so we create the ShuffleMask for such a pattern. + SmallVector ShuffleMask; + unsigned NumLanes = cast(VecTy)->getNumElements(); + for (unsigned Lane : seq(0, NumLanes)) { + int Idx = Lane % 2 == 0 ? Lane : NumLanes + Lane; + ShuffleMask.push_back(Idx); + } + return TTI->isLegalAltInstr(VecTy, EvenOpcode, OddOpcode, ShuffleMask); +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap> VFToOrderedEntries; @@ -3649,23 +3671,9 @@ // Patterns like [fadd,fsub] can be combined into a single instruction in // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need // to take into account their order when looking for the most used order. - if (TE->isAltShuffleWithRepeatingEvenOddOpcodes()) { - VectorType *VecTy = - FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size()); - unsigned EvenOpcode = TE->getOpcode(); - unsigned OddOpcode = TE->getAltOpcode(); - // We have already checked that it is a repeating EvenOpcode/OddOpcode - // pattern so we create the ShuffleMask for such a pattern. - SmallVector ShuffleMask; - unsigned NumLanes = cast(VecTy)->getNumElements(); - for (unsigned Lane : seq(0, NumLanes)) { - int Idx = Lane % 2 == 0 ? 
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -939,6 +939,10 @@
   /// insertelement nodes, otherwise skip them.
   Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
 
+  /// \returns true if \p TE is an alt-shuffle that can be lowered to a single
+  /// instruction. An example of this is the X86 addsub instruction.
+  bool isAltShuffleThatLowersToOneInstr(const TreeEntry *TE) const;
+
   /// Reorders the current graph to the most profitable order starting from the
   /// root node to the leaf nodes. The best order is chosen only from the nodes
   /// of the same size (vectorization factor). Smaller nodes are considered
@@ -3617,6 +3621,24 @@
   return None;
 }
 
+bool BoUpSLP::isAltShuffleThatLowersToOneInstr(const TreeEntry *TE) const {
+  if (!TE->isAltShuffleWithRepeatingEvenOddOpcodes())
+    return false;
+  VectorType *VecTy =
+      FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
+  unsigned EvenOpcode = TE->getOpcode();
+  unsigned OddOpcode = TE->getAltOpcode();
+  // We have already checked that this is a repeating EvenOpcode/OddOpcode
+  // pattern, so build the ShuffleMask for such a pattern.
+  SmallVector<int> ShuffleMask;
+  unsigned NumLanes = cast<FixedVectorType>(VecTy)->getNumElements();
+  for (unsigned Lane : seq<unsigned>(0, NumLanes)) {
+    int Idx = Lane % 2 == 0 ? Lane : NumLanes + Lane;
+    ShuffleMask.push_back(Idx);
+  }
+  return TTI->isLegalAltInstr(VecTy, EvenOpcode, OddOpcode, ShuffleMask);
+}
+
 void BoUpSLP::reorderTopToBottom() {
   // Maps VF to the graph nodes.
   DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -3649,23 +3671,9 @@
     // Patterns like [fadd,fsub] can be combined into a single instruction in
     // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
    // to take into account their order when looking for the most used order.
-    if (TE->isAltShuffleWithRepeatingEvenOddOpcodes()) {
-      VectorType *VecTy =
-          FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
-      unsigned EvenOpcode = TE->getOpcode();
-      unsigned OddOpcode = TE->getAltOpcode();
-      // We have already checked that it is a repeating EvenOpcode/OddOpcode
-      // pattern so we create the ShuffleMask for such a pattern.
-      SmallVector<int> ShuffleMask;
-      unsigned NumLanes = cast<FixedVectorType>(VecTy)->getNumElements();
-      for (unsigned Lane : seq<unsigned>(0, NumLanes)) {
-        int Idx = Lane % 2 == 0 ? Lane : NumLanes + Lane;
-        ShuffleMask.push_back(Idx);
-      }
-      if (TTI->isLegalAltInstr(VecTy, EvenOpcode, OddOpcode, ShuffleMask)) {
-        VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
-      }
+    if (isAltShuffleThatLowersToOneInstr(TE.get())) {
+      VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+      AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
       // TODO: Check the reverse order too.
     }
@@ -6285,8 +6293,10 @@
         ScalarCost += TTI->getInstructionCost(I, CostKind);
       }
       // VecCost is equal to sum of the cost of creating 2 vectors
-      // and the cost of creating shuffle.
+      // and the cost of creating shuffle, except if this is an alternate
+      // sequence that can be lowered to a single instruction, like x86 addsub.
       InstructionCost VecCost = 0;
+      bool LowersToOneAltInstr = isAltShuffleThatLowersToOneInstr(E);
       // Try to find the previous shuffle node with the same operands and same
       // main/alternate ops.
       auto &&TryFindNodeWithEqualOperands = [this, E]() {
@@ -6311,9 +6321,12 @@
         // No need to add new vector costs here since we're going to reuse
         // same main/alternate vector ops, just do different shuffling.
       } else if (Instruction::isBinaryOp(E->getOpcode())) {
-        VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
-        VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
-                                               CostKind);
+        if (!LowersToOneAltInstr) {
+          VecCost =
+              TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+          VecCost +=
+              TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
+        }
       } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
         VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
                                           Builder.getInt1Ty(),
@@ -6333,10 +6346,7 @@
                                          TTI::CastContextHint::None,
                                          CostKind);
       }
-      if (E->ReuseShuffleIndices.empty()) {
-        CommonCost =
-            TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
-      } else {
+      if (LowersToOneAltInstr) {
         SmallVector<int> Mask;
         buildShuffleEntryMask(
             E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
@@ -6345,8 +6355,25 @@
               return I->getOpcode() == E->getAltOpcode();
             },
             Mask);
-        CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
-                                         FinalVecTy, Mask);
+        Value *Args[] = {VL[0], VL[1]};
+        CommonCost = TTI->getAltInstrCost(FinalVecTy, Mask, Args);
+      } else {
+        if (E->ReuseShuffleIndices.empty()) {
+          CommonCost =
+              TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
+        } else {
+          SmallVector<int> Mask;
+          buildShuffleEntryMask(
+              E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+              [E](Instruction *I) {
+                assert(E->isOpcodeOrAlt(I) &&
+                       "Unexpected main/alternate opcode");
+                return I->getOpcode() == E->getAltOpcode();
+              },
+              Mask);
+          CommonCost = TTI->getShuffleCost(
+              TargetTransformInfo::SK_PermuteTwoSrc, FinalVecTy, Mask);
+        }
       }
       LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
       return CommonCost + VecCost - ScalarCost;
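Not part of the patch: a back-of-the-envelope model of the final CommonCost + VecCost - ScalarCost computation above, showing why pricing the whole pattern as one addsub helps. The numbers are illustrative assumptions, not values from the real X86 cost tables.

    #include <cstdio>

    int main() {
      // Four scalar FP ops (2 fadd + 2 fsub) at an assumed cost of 1 each.
      int ScalarCost = 4;
      // Old path: vector fadd + vector fsub, then an SK_PermuteTwoSrc
      // shuffle to interleave the even/odd lanes.
      int OldVecCost = 1 + 1, OldCommonCost = 1;
      // New path: VecCost stays 0 and CommonCost is getAltInstrCost(),
      // which the X86 hook above returns as 1 on AVX (one vaddsubps).
      int NewVecCost = 0, NewCommonCost = 1;
      printf("old: %d\n", OldCommonCost + OldVecCost - ScalarCost); // -1
      printf("new: %d\n", NewCommonCost + NewVecCost - ScalarCost); // -3
      return 0;
    }

The more negative the total, the more profitable vectorization looks to SLP, so under these assumed costs the addsub pattern becomes noticeably more attractive than it was when charged as two vector arithmetic ops plus a two-source permute.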