diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1154,8 +1154,11 @@ /// \return The expected cost of vector Insert and Extract. /// Use -1 to indicate that there is no information on the index value. + /// Caller can provide the instruction that holds Opcode in `I`, or use + /// nullptr to indcate there is no instruction information. InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index = -1) const; + unsigned Index = -1, + const Instruction *I = nullptr) const; /// \return The cost of replication shuffle of \p VF elements typed \p EltTy /// \p ReplicationFactor times. @@ -1722,7 +1725,8 @@ TTI::TargetCostKind CostKind, const Instruction *I) = 0; virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) = 0; + unsigned Index, + const Instruction *I) = 0; virtual InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, @@ -2267,9 +2271,9 @@ const Instruction *I) override { return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); } - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) override { - return Impl.getVectorInstrCost(Opcode, Val, Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I) override { + return Impl.getVectorInstrCost(Opcode, Val, Index, I); } InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -568,8 +568,8 @@ return 1; } - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const { + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I) const { return 1; } @@ -1139,7 +1139,7 @@ if (auto *CI = dyn_cast(IE->getOperand(2))) if (CI->getValue().getActiveBits() <= 32) Idx = CI->getZExtValue(); - return TargetTTI->getVectorInstrCost(Opcode, Ty, Idx); + return TargetTTI->getVectorInstrCost(Opcode, Ty, Idx, I); } case Instruction::ShuffleVector: { auto *Shuffle = dyn_cast(U); @@ -1229,7 +1229,7 @@ if (CI->getValue().getActiveBits() <= 32) Idx = CI->getZExtValue(); Type *DstTy = U->getOperand(0)->getType(); - return TargetTTI->getVectorInstrCost(Opcode, DstTy, Idx); + return TargetTTI->getVectorInstrCost(Opcode, DstTy, Idx, I); } } // By default, just classify everything as 'basic'. diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -88,10 +88,12 @@ InstructionCost Cost = 0; // Broadcast cost is equal to the cost of extracting the zero'th element // plus the cost of inserting it into every element of the result vector. - Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0); + Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0, + nullptr); for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { - Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); + Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i, + nullptr); } return Cost; } @@ -108,8 +110,10 @@ // vector and finally index 3 of second vector and insert them at index // <0,1,2,3> of result vector. for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { - Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); - Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i); + Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i, + nullptr); + Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i, + nullptr); } return Cost; } @@ -132,9 +136,9 @@ // type. for (int i = 0; i != NumSubElts; ++i) { Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, - i + Index); - Cost += - thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i); + i + Index, nullptr); + Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i, + nullptr); } return Cost; } @@ -156,10 +160,10 @@ // the source type plus the cost of inserting them into the result vector // type. for (int i = 0; i != NumSubElts; ++i) { - Cost += - thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i); + Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, + i, nullptr); Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, - i + Index); + i + Index, nullptr); } return Cost; } @@ -210,7 +214,7 @@ FixedVectorType::get( PointerType::get(VT->getElementType(), 0), VT->getNumElements()), - -1) + -1, nullptr) : 0; InstructionCost LoadCost = VT->getNumElements() * @@ -235,7 +239,7 @@ Instruction::ExtractElement, FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VT->getNumElements()), - -1) + + -1, nullptr) + getCFInstrCost(Instruction::Br, CostKind) + getCFInstrCost(Instruction::PHI, CostKind)); } @@ -716,9 +720,11 @@ if (!DemandedElts[i]) continue; if (Insert) - Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i); + Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i, + nullptr); if (Extract) - Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i); + Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i, + nullptr); } return Cost; @@ -1083,8 +1089,9 @@ InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) { + // FIXME: Pass instruction pointer. return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, - Index) + + Index, nullptr) + thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), TTI::CastContextHint::None, TTI::TCK_RecipThroughput); @@ -1146,8 +1153,8 @@ return 1; } - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I = nullptr) { std::pair LT = getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); @@ -2205,7 +2212,8 @@ ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); return ShuffleCost + ArithCost + - thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); + thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0, + nullptr); } /// Try to calculate the cost of performing strict (in-order) reductions, @@ -2311,7 +2319,8 @@ // The last min/max should be in vector registers and we counted it above. // So just need a single extractelement. return ShuffleCost + MinMaxCost + - thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); + thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0, + nullptr); } InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -860,10 +860,9 @@ return Cost; } -InstructionCost TargetTransformInfo::getVectorInstrCost(unsigned Opcode, - Type *Val, - unsigned Index) const { - InstructionCost Cost = TTIImpl->getVectorInstrCost(Opcode, Val, Index); +InstructionCost TargetTransformInfo::getVectorInstrCost( + unsigned Opcode, Type *Val, unsigned Index, const Instruction *I) const { + InstructionCost Cost = TTIImpl->getVectorInstrCost(Opcode, Val, Index, I); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7229,8 +7229,8 @@ // The scalar chain of computation has to pay for the transition // scalar to vector. // The vector chain has to account for the combining cost. - InstructionCost ScalarCost = - TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index); + InstructionCost ScalarCost = TTI.getVectorInstrCost( + Transition->getOpcode(), PromotedType, Index, Transition); InstructionCost VectorCost = StoreExtractCombineCost; enum TargetTransformInfo::TargetCostKind CostKind = TargetTransformInfo::TCK_RecipThroughput; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -173,8 +173,8 @@ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1776,8 +1776,11 @@ // Get the cost for the extract. We compute the cost (if any) for the extend // below. + // + // FIXME: Change signature of `getExtractWithExtendCost` so caller + // can optionally provide the extract-element instruction as context. InstructionCost Cost = - getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); + getVectorInstrCost(Instruction::ExtractElement, VecTy, Index, nullptr); // Legalize the types. auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); @@ -1830,7 +1833,8 @@ } InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { @@ -1848,8 +1852,45 @@ Index = Index % Width; } - // The element at index zero is already inside the vector. - if (Index == 0) + // FIXME: Use `IsExtractedElementUsedAsInteger` to decide cost when index is + // explicit (i.e., not -1, not limited to 0). + auto IsExtractedElementUsedAsInteger = + [Val](const Instruction *Inst) -> bool { + if (!isa_and_nonnull(Inst) || + !Val->getScalarType()->isIntegerTy()) + return false; + + // Inst is an ExtractElementInst and element type in the vector is + // integer. + bool UsedAsInteger = false; + for (const Use &U : Inst->uses()) { + Instruction *UserI = dyn_cast(U.getUser()); + + assert(UserI && "All users of an instruction should be instructions"); + + // If the extracted element is used in a store operation, it must be the + // value (not pointer); and it could be stored bit-wise without an + // explicit conversion to integer. + if (isa(UserI)) + continue; + + // If used as a floating point value, users can access the subregister directly. + if (isa(UserI) || isa(UserI)) + continue; + + // For the rest of cases, assume that the extracted element will be used + // as a scalar/integer. + UsedAsInteger = true; + break; + } + return UsedAsInteger; + }; + + // For extract_element instruction, the element at index zero is already + // inside the floating-point subregister. + // + // If element type is integer, an explicit move operation will be codegen'd. + if (Index == 0 && !IsExtractedElementUsedAsInteger(I)) return 0; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -160,7 +160,7 @@ ArrayRef Indices = {}) const; InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index); + unsigned Index, const Instruction *I); bool isSourceOfDivergence(const Value *V) const; bool isAlwaysUniform(const Value *V) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -789,7 +789,8 @@ } InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index) { + unsigned Index, + const Instruction *I) { switch (Opcode) { case Instruction::ExtractElement: case Instruction::InsertElement: { @@ -798,7 +799,7 @@ if (EltSize < 32) { if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) return 0; - return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I); } // Extracts are just reads of a subregister, so are free. Inserts are @@ -809,7 +810,7 @@ return Index == ~0u ? 2 : 0; } default: - return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I); } } diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h --- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h @@ -61,7 +61,7 @@ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index); + unsigned Index, const Instruction *I); }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp @@ -108,14 +108,15 @@ } InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index) { + unsigned Index, + const Instruction *I) { switch (Opcode) { case Instruction::ExtractElement: case Instruction::InsertElement: { unsigned EltSize = DL.getTypeSizeInBits(cast(ValTy)->getElementType()); if (EltSize < 32) { - return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I); } // Extracts are just reads of a subregister, so are free. Inserts are @@ -126,7 +127,7 @@ return Index == ~0u ? 2 : 0; } default: - return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I); } } diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -237,8 +237,8 @@ TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -872,7 +872,8 @@ } InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index) { + unsigned Index, + const Instruction *I) { // Penalize inserting into an D-subregister. We end up with a three times // lower estimated throughput on swift. if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement && @@ -891,7 +892,7 @@ if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32) return std::max( - BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); + BaseT::getVectorInstrCost(Opcode, ValTy, Index, I), 2U); } if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement || @@ -904,7 +905,7 @@ return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1); } - return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I); } InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -151,8 +151,8 @@ TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -306,7 +306,8 @@ } InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { Type *ElemTy = Val->isVectorTy() ? cast(Val)->getElementType() : Val; if (Opcode == Instruction::InsertElement) { @@ -315,7 +316,8 @@ if (ElemTy->isIntegerTy(32)) return Cost; // If it's not a 32-bit value, there will need to be an extract. - return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index); + return Cost + + getVectorInstrCost(Instruction::ExtractElement, Val, Index, I); } if (Opcode == Instruction::ExtractElement) diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -123,8 +123,8 @@ CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -1071,7 +1071,8 @@ } InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { assert(Val->isVectorTy() && "This must be a vector type"); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -1081,7 +1082,7 @@ if (!CostFactor.isValid()) return InstructionCost::getMax(); - InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index); + InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index, I); Cost *= CostFactor; if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) { @@ -1146,7 +1147,6 @@ unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I) { - InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr); if (!CostFactor.isValid()) return InstructionCost::getMax(); @@ -1222,7 +1222,7 @@ if (Src->isVectorTy() && Opcode == Instruction::Store) for (int i = 0, e = cast(Src)->getNumElements(); i < e; ++i) - Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); + Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i, I); return Cost; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -107,8 +107,8 @@ CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue); InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -997,7 +997,8 @@ } InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { // vlvgp will insert two grs into a vector register, so only count half the // number of instructions. if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64)) @@ -1013,7 +1014,7 @@ return Cost; } - return BaseT::getVectorInstrCost(Opcode, Val, Index); + return BaseT::getVectorInstrCost(Opcode, Val, Index, I); } // Check if a load may be folded as a memory operand in its user. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -67,8 +67,8 @@ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); /// @} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -84,9 +84,10 @@ InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { InstructionCost Cost = - BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index); + BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index, I); // SIMD128's insert/extract currently only take constant indices. if (Index == -1u) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -146,8 +146,8 @@ CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3644,7 +3644,8 @@ } InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { static const CostTblEntry SLMCostTbl[] = { { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, @@ -3772,7 +3773,8 @@ if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) RegisterFileMoveCost += 1; - return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; + return BaseT::getVectorInstrCost(Opcode, Val, Index, I) + + RegisterFileMoveCost; } InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, @@ -3901,7 +3903,8 @@ for (unsigned I = 0; I != NumElts; ++I) if (WidenedDemandedElts[I]) { unsigned Idx = I % Scale; - Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx); + Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx, + nullptr); } return Cost; @@ -4512,7 +4515,8 @@ } // Add the final extract element to the cost. - return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); + return ReductionCost + + getVectorInstrCost(Instruction::ExtractElement, Ty, 0, nullptr); } InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, @@ -4813,7 +4817,8 @@ } // Add the final extract element to the cost. - return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); + return MinMaxCost + + getVectorInstrCost(Instruction::ExtractElement, Ty, 0, nullptr); } /// Calculate the cost of materializing a 64-bit value. This helper diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6405,6 +6405,7 @@ } StoreInst *SI = cast(I); + // FIXME: Use a specific type rather than unknown. bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, @@ -6412,7 +6413,7 @@ (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - VF.getKnownMinValue() - 1)); + VF.getKnownMinValue() - 1, nullptr)); } InstructionCost diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5874,7 +5874,7 @@ } } Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement, - EE->getVectorOperandType(), Idx); + EE->getVectorOperandType(), Idx, EE); } // Add a cost for subvector extracts/inserts if required. for (const auto &Data : ExtractVectorsTys) { @@ -6109,10 +6109,11 @@ auto *EE = cast(VL[I]); CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, EE->getVectorOperandType(), - *getExtractIndex(EE)); + *getExtractIndex(EE), EE); } else { - CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); + CommonCost -= + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx, + dyn_cast(VL[I])); ++Idx; } } @@ -6122,11 +6123,12 @@ auto *EE = cast(V); CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, EE->getVectorOperandType(), - *getExtractIndex(EE)); + *getExtractIndex(EE), EE); } else { --Idx; - CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); + CommonCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx, + dyn_cast(V)); } } } @@ -6150,8 +6152,8 @@ continue; } } - CommonCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); + CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, + VecTy, I, EI); } } else { AdjustExtractsCost(CommonCost); @@ -7142,8 +7144,8 @@ ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), VecTy, EU.Lane); } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); + ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + EU.Lane, nullptr); } } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -271,9 +271,9 @@ Type *VecTy = Ext0->getVectorOperand()->getType(); assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types"); InstructionCost Cost0 = - TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); + TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0, Ext0); InstructionCost Cost1 = - TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); + TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1, Ext1); // If both costs are invalid no shuffle is needed if (!Cost0.isValid() && !Cost1.isValid()) @@ -337,10 +337,10 @@ unsigned Ext0Index = Ext0IndexC->getZExtValue(); unsigned Ext1Index = Ext1IndexC->getZExtValue(); - InstructionCost Extract0Cost = - TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index); - InstructionCost Extract1Cost = - TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index); + InstructionCost Extract0Cost = TTI.getVectorInstrCost( + Instruction::ExtractElement, VecTy, Ext0Index, Ext0); + InstructionCost Extract1Cost = TTI.getVectorInstrCost( + Instruction::ExtractElement, VecTy, Ext1Index, Ext1); // A more expensive extract will always be replaced by a splat shuffle. // For example, if Ext0 is more expensive: @@ -665,7 +665,7 @@ // Get cost estimate for the insert element. This cost will factor into // both sequences. InstructionCost InsertCost = - TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index); + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index, &I); InstructionCost OldCost = (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; InstructionCost NewCost = ScalarOpCost + InsertCost + @@ -755,8 +755,8 @@ return false; InstructionCost OldCost = - TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); - OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); + TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0, Ext0); + OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1, Ext1); OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred) * @@ -776,7 +776,7 @@ NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, ShufMask); NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); - NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex); + NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex, Ext0); // Aggressively form vector ops if the cost is equal because the transform // may enable further optimization. @@ -1036,7 +1036,7 @@ auto *Index = dyn_cast(UI->getOperand(1)); OriginalCost += TTI.getVectorInstrCost(Instruction::ExtractElement, LI->getType(), - Index ? Index->getZExtValue() : -1); + Index ? Index->getZExtValue() : -1, LI); ScalarizedCost += TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(), Align(1), LI->getPointerAddressSpace()); diff --git a/llvm/test/Analysis/CostModel/AArch64/kryo.ll b/llvm/test/Analysis/CostModel/AArch64/kryo.ll --- a/llvm/test/Analysis/CostModel/AArch64/kryo.ll +++ b/llvm/test/Analysis/CostModel/AArch64/kryo.ll @@ -21,26 +21,22 @@ ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1 %t3 = insertelement <2 x i64> undef, i64 undef, i32 0 %t4 = insertelement <2 x i64> undef, i64 undef, i32 1 - ret void } ; CHECK-LABEL: vectorInstrExtractCost define i64 @vectorInstrExtractCost(<4 x i64> %vecreg) { - - ; Vector extracts - extracting each element at index 0 is considered - ; free in the current implementation. When extracting element at index - ; 2, 2 is rounded to 0, so extracting element at index 2 has cost 0 as - ; well. - ; ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 1 - ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 2 + ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 2 %t1 = extractelement <4 x i64> %vecreg, i32 1 %t2 = extractelement <4 x i64> %vecreg, i32 2 %ele = add i64 %t2, 1 %cond = icmp eq i64 %t1, %ele - ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 0 + ; Vector extracts - extracting each element should have a cost + ; if they are used as integers. + ; + ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 0 ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 3 %t0 = extractelement <4 x i64> %vecreg, i32 0 %t3 = extractelement <4 x i64> %vecreg, i32 3 diff --git a/llvm/test/Transforms/LICM/AArch64/extract-element.ll b/llvm/test/Transforms/LICM/AArch64/extract-element.ll --- a/llvm/test/Transforms/LICM/AArch64/extract-element.ll +++ b/llvm/test/Transforms/LICM/AArch64/extract-element.ll @@ -18,24 +18,23 @@ ; CHECK-NEXT: [[TMP12]] = add i64 [[TMP4]], 1 ; CHECK-NEXT: br label [[TMP3]] ; CHECK: .split.loop.exit: -; CHECK-NEXT: [[DOTLCSSA7:%.*]] = phi <1 x i64> [ [[TMP8]], [[TMP6]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP9]], [[TMP6]] ] ; CHECK-NEXT: [[DOTLCSSA6:%.*]] = phi i64 [ [[TMP4]], [[TMP6]] ] ; CHECK-NEXT: [[DOTPH:%.*]] = phi i1 [ [[TMP5]], [[TMP6]] ] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[DOTLCSSA7]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], -1 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[DOTLCSSA6]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp uge i64 [[TMP15]], [[TMP1]] -; CHECK-NEXT: br label [[TMP17:%.*]] +; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[DOTLCSSA]], -1 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[DOTLCSSA6]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp uge i64 [[TMP14]], [[TMP1]] +; CHECK-NEXT: br label [[TMP16:%.*]] ; CHECK: .split.loop.exit2: ; CHECK-NEXT: [[DOTPH3:%.*]] = phi i1 [ [[TMP5]], [[TMP3]] ] ; CHECK-NEXT: [[DOTPH4:%.*]] = phi i1 [ undef, [[TMP3]] ] -; CHECK-NEXT: br label [[TMP17]] -; CHECK: 17: -; CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ] -; CHECK-NEXT: [[TMP19:%.*]] = phi i1 [ [[TMP16]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ] -; CHECK-NEXT: [[TMP20:%.*]] = xor i1 [[TMP18]], true -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i1 true, i1 [[TMP19]] -; CHECK-NEXT: ret i1 [[TMP21]] +; CHECK-NEXT: br label [[TMP16]] +; CHECK: 16: +; CHECK-NEXT: [[TMP17:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ [[TMP15]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ] +; CHECK-NEXT: [[TMP19:%.*]] = xor i1 [[TMP17]], true +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i1 true, i1 [[TMP18]] +; CHECK-NEXT: ret i1 [[TMP20]] ; br label %3