diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1140,8 +1140,11 @@ /// \return The expected cost of vector Insert and Extract. /// Use -1 to indicate that there is no information on the index value. + /// Caller can provide the instruction that holds Opcode in `I`, or use + /// nullptr to indicate there is no instruction information. InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index = -1) const; + unsigned Index = -1, + const Instruction *I = nullptr) const; /// \return The cost of replication shuffle of \p VF elements typed \p EltTy /// \p ReplicationFactor times. @@ -1704,7 +1707,8 @@ TTI::TargetCostKind CostKind, const Instruction *I) = 0; virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) = 0; + unsigned Index, + const Instruction *I) = 0; virtual InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, @@ -2243,9 +2247,9 @@ const Instruction *I) override { return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); } - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) override { - return Impl.getVectorInstrCost(Opcode, Val, Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I) override { + return Impl.getVectorInstrCost(Opcode, Val, Index, I); } InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -561,8 +561,8 @@ return 1; } - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const { 
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I) const { return 1; } @@ -1132,7 +1132,7 @@ if (auto *CI = dyn_cast(IE->getOperand(2))) if (CI->getValue().getActiveBits() <= 32) Idx = CI->getZExtValue(); - return TargetTTI->getVectorInstrCost(Opcode, Ty, Idx); + return TargetTTI->getVectorInstrCost(Opcode, Ty, Idx, I); } case Instruction::ShuffleVector: { auto *Shuffle = dyn_cast(U); @@ -1222,7 +1222,7 @@ if (CI->getValue().getActiveBits() <= 32) Idx = CI->getZExtValue(); Type *DstTy = U->getOperand(0)->getType(); - return TargetTTI->getVectorInstrCost(Opcode, DstTy, Idx); + return TargetTTI->getVectorInstrCost(Opcode, DstTy, Idx, I); } } // By default, just classify everything as 'basic'. diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -88,10 +88,12 @@ InstructionCost Cost = 0; // Broadcast cost is equal to the cost of extracting the zero'th element // plus the cost of inserting it into every element of the result vector. - Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0); + Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0, + nullptr); for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { - Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); + Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i, + nullptr); } return Cost; } @@ -108,8 +110,10 @@ // vector and finally index 3 of second vector and insert them at index // <0,1,2,3> of result vector. 
for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { - Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); - Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i); + Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i, + nullptr); + Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i, + nullptr); } return Cost; } @@ -132,9 +136,9 @@ // type. for (int i = 0; i != NumSubElts; ++i) { Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, - i + Index); - Cost += - thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i); + i + Index, nullptr); + Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i, + nullptr); } return Cost; } @@ -156,10 +160,10 @@ // the source type plus the cost of inserting them into the result vector // type. for (int i = 0; i != NumSubElts; ++i) { - Cost += - thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i); + Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, + i, nullptr); Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, - i + Index); + i + Index, nullptr); } return Cost; } @@ -210,7 +214,7 @@ FixedVectorType::get( PointerType::get(VT->getElementType(), 0), VT->getNumElements()), - -1) + -1, nullptr) : 0; InstructionCost LoadCost = VT->getNumElements() * @@ -235,7 +239,7 @@ Instruction::ExtractElement, FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VT->getNumElements()), - -1) + + -1, nullptr) + getCFInstrCost(Instruction::Br, CostKind) + getCFInstrCost(Instruction::PHI, CostKind)); } @@ -714,9 +718,11 @@ if (!DemandedElts[i]) continue; if (Insert) - Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i); + Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i, + nullptr); if (Extract) - Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i); + Cost += 
thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i, + nullptr); } return Cost; @@ -1079,8 +1085,9 @@ InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) { + // FIXME: Pass instruction pointer. return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, - Index) + + Index, nullptr) + thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), TTI::CastContextHint::None, TTI::TCK_RecipThroughput); @@ -1139,8 +1146,8 @@ return 1; } - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I = nullptr) { std::pair LT = getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); @@ -2180,7 +2187,8 @@ ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); return ShuffleCost + ArithCost + - thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); + thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0, + nullptr); } /// Try to calculate the cost of performing strict (in-order) reductions, @@ -2286,7 +2294,8 @@ // The last min/max should be in vector registers and we counted it above. // So just need a single extractelement. 
return ShuffleCost + MinMaxCost + - thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); + thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0, + nullptr); } InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -850,10 +850,9 @@ return Cost; } -InstructionCost TargetTransformInfo::getVectorInstrCost(unsigned Opcode, - Type *Val, - unsigned Index) const { - InstructionCost Cost = TTIImpl->getVectorInstrCost(Opcode, Val, Index); +InstructionCost TargetTransformInfo::getVectorInstrCost( + unsigned Opcode, Type *Val, unsigned Index, const Instruction *I) const { + InstructionCost Cost = TTIImpl->getVectorInstrCost(Opcode, Val, Index, I); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7230,8 +7230,8 @@ // The scalar chain of computation has to pay for the transition // scalar to vector. // The vector chain has to account for the combining cost. 
- InstructionCost ScalarCost = - TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index); + InstructionCost ScalarCost = TTI.getVectorInstrCost( + Transition->getOpcode(), PromotedType, Index, Transition); InstructionCost VectorCost = StoreExtractCombineCost; enum TargetTransformInfo::TargetCostKind CostKind = TargetTransformInfo::TCK_RecipThroughput; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -173,8 +173,8 @@ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1777,8 +1777,11 @@ // Get the cost for the extract. We compute the cost (if any) for the extend // below. + // + // FIXME: Change signature of `getExtractWithExtendCost` so caller + // can optionally provide the extract-element instruction as context. InstructionCost Cost = - getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); + getVectorInstrCost(Instruction::ExtractElement, VecTy, Index, nullptr); // Legalize the types. 
auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); @@ -1831,7 +1834,8 @@ } InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { @@ -1849,8 +1853,45 @@ Index = Index % Width; } - // The element at index zero is already inside the vector. - if (Index == 0) + // FIXME: Use `IsExtractedElementUsedAsInteger` to decide cost when index is + // explicit (i.e., not -1, not limited to 0). + auto IsExtractedElementUsedAsInteger = + [Val](const Instruction *Inst) -> bool { + if (!isa_and_nonnull(Inst) || + !Val->getScalarType()->isIntegerTy()) + return false; + + // Inst is an ExtractElementInst and element type in the vector is + // integer. + bool UsedAsInteger = false; + for (const Use &U : Inst->uses()) { + Instruction *UserI = dyn_cast(U.getUser()); + + assert(UserI && "All users of an instruction should be instructions"); + + // If the extracted element is used in a store operation, it must be the + // value (not pointer); and it could be stored bit-wise without an + // explicit conversion to integer. + if (isa(UserI)) + continue; + + // If used as a floating point value, users can access the subregister directly. + if (isa(UserI) || isa(UserI)) + continue; + + // For the rest of cases, assume that the extracted element will be used + // as a scalar/integer. + UsedAsInteger = true; + break; + } + return UsedAsInteger; + }; + + // For extract_element instruction, the element at index zero is already + // inside the floating-point subregister. + // + // If element type is integer, an explicit move operation will be codegen'd. 
+ if (Index == 0 && !IsExtractedElementUsedAsInteger(I)) return 0; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -160,7 +160,7 @@ ArrayRef Indices = {}) const; InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index); + unsigned Index, const Instruction *I); bool isSourceOfDivergence(const Value *V) const; bool isAlwaysUniform(const Value *V) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -789,7 +789,8 @@ } InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index) { + unsigned Index, + const Instruction *I) { switch (Opcode) { case Instruction::ExtractElement: case Instruction::InsertElement: { @@ -798,7 +799,7 @@ if (EltSize < 32) { if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) return 0; - return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I); } // Extracts are just reads of a subregister, so are free. Inserts are @@ -809,7 +810,7 @@ return Index == ~0u ? 
2 : 0; } default: - return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I); } } diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h --- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h @@ -61,7 +61,7 @@ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index); + unsigned Index, const Instruction *I); }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp @@ -108,14 +108,15 @@ } InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index) { + unsigned Index, + const Instruction *I) { switch (Opcode) { case Instruction::ExtractElement: case Instruction::InsertElement: { unsigned EltSize = DL.getTypeSizeInBits(cast(ValTy)->getElementType()); if (EltSize < 32) { - return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I); } // Extracts are just reads of a subregister, so are free. Inserts are @@ -126,7 +127,7 @@ return Index == ~0u ? 
2 : 0; } default: - return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I); } } diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -237,8 +237,8 @@ TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -872,7 +872,8 @@ } InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index) { + unsigned Index, + const Instruction *I) { // Penalize inserting into an D-subregister. We end up with a three times // lower estimated throughput on swift. if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement && @@ -891,7 +892,7 @@ if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32) return std::max( - BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); + BaseT::getVectorInstrCost(Opcode, ValTy, Index, I), 2U); } if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement || @@ -904,7 +905,7 @@ return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 
4 : 1); } - return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + return BaseT::getVectorInstrCost(Opcode, ValTy, Index, I); } InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -151,8 +151,8 @@ TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -306,7 +306,8 @@ } InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { Type *ElemTy = Val->isVectorTy() ? cast(Val)->getElementType() : Val; if (Opcode == Instruction::InsertElement) { @@ -315,7 +316,8 @@ if (ElemTy->isIntegerTy(32)) return Cost; // If it's not a 32-bit value, there will need to be an extract. 
- return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index); + return Cost + + getVectorInstrCost(Instruction::ExtractElement, Val, Index, nullptr); } if (Opcode == Instruction::ExtractElement) diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -123,8 +123,8 @@ CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -1071,7 +1071,8 @@ } InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { assert(Val->isVectorTy() && "This must be a vector type"); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -1081,7 +1082,7 @@ if (!CostFactor.isValid()) return InstructionCost::getMax(); - InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index); + InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index, I); Cost *= CostFactor; if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) { @@ -1146,7 +1147,6 @@ unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I) { - InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr); if (!CostFactor.isValid()) return InstructionCost::getMax(); @@ -1222,7 +1222,7 @@ if (Src->isVectorTy() && Opcode == Instruction::Store) for (int i 
= 0, e = cast(Src)->getNumElements(); i < e; ++i) - Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); + Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i, nullptr); return Cost; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -107,8 +107,8 @@ CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue); InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -993,7 +993,8 @@ } InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { // vlvgp will insert two grs into a vector register, so only count half the // number of instructions. if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64)) @@ -1009,7 +1010,7 @@ return Cost; } - return BaseT::getVectorInstrCost(Opcode, Val, Index); + return BaseT::getVectorInstrCost(Opcode, Val, Index, I); } // Check if a load may be folded as a memory operand in its user. 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -67,8 +67,8 @@ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); /// @} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -84,9 +84,10 @@ InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { InstructionCost Cost = - BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index); + BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index, I); // SIMD128's insert/extract currently only take constant indices. 
if (Index == -1u) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -146,8 +146,8 @@ CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, + const Instruction *I); InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3644,7 +3644,8 @@ } InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { + unsigned Index, + const Instruction *I) { static const CostTblEntry SLMCostTbl[] = { { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, @@ -3772,7 +3773,8 @@ if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) RegisterFileMoveCost += 1; - return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; + return BaseT::getVectorInstrCost(Opcode, Val, Index, I) + + RegisterFileMoveCost; } InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, @@ -3901,7 +3903,8 @@ for (unsigned I = 0; I != NumElts; ++I) if (WidenedDemandedElts[I]) { unsigned Idx = I % Scale; - Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx); + Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx, + nullptr); } return Cost; @@ -4512,7 +4515,8 @@ } // Add the final extract element to the cost. 
- return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); + return ReductionCost + + getVectorInstrCost(Instruction::ExtractElement, Ty, 0, nullptr); } InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, @@ -4813,7 +4817,8 @@ } // Add the final extract element to the cost. - return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); + return MinMaxCost + + getVectorInstrCost(Instruction::ExtractElement, Ty, 0, nullptr); } /// Calculate the cost of materializing a 64-bit value. This helper diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6422,6 +6422,7 @@ } StoreInst *SI = cast(I); + // FIXME: Use a specific type rather than unknown. bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, @@ -6429,7 +6430,7 @@ (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - VF.getKnownMinValue() - 1)); + VF.getKnownMinValue() - 1, nullptr)); } InstructionCost diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5821,7 +5821,7 @@ } } Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement, - EE->getVectorOperandType(), Idx); + EE->getVectorOperandType(), Idx, EE); } // Add a cost for subvector extracts/inserts if required. 
for (const auto &Data : ExtractVectorsTys) { @@ -6056,10 +6056,11 @@ auto *EE = cast(VL[I]); CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, EE->getVectorOperandType(), - *getExtractIndex(EE)); + *getExtractIndex(EE), EE); } else { - CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); + CommonCost -= + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx, + dyn_cast(VL[I])); ++Idx; } } @@ -6069,11 +6070,12 @@ auto *EE = cast(V); CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, EE->getVectorOperandType(), - *getExtractIndex(EE)); + *getExtractIndex(EE), EE); } else { --Idx; - CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); + CommonCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx, + dyn_cast(V)); } } } @@ -6097,8 +6099,8 @@ continue; } } - CommonCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); + CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, + VecTy, I, EI); } } else { AdjustExtractsCost(CommonCost); @@ -7079,8 +7081,8 @@ ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), VecTy, EU.Lane); } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); + ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + EU.Lane, nullptr); } } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -271,9 +271,9 @@ Type *VecTy = Ext0->getVectorOperand()->getType(); assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types"); InstructionCost Cost0 = - TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); + TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0, Ext0); InstructionCost Cost1 = - TTI.getVectorInstrCost(Ext1->getOpcode(), 
VecTy, Index1); + TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1, Ext1); // If both costs are invalid no shuffle is needed if (!Cost0.isValid() && !Cost1.isValid()) @@ -337,10 +337,10 @@ unsigned Ext0Index = Ext0IndexC->getZExtValue(); unsigned Ext1Index = Ext1IndexC->getZExtValue(); - InstructionCost Extract0Cost = - TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index); - InstructionCost Extract1Cost = - TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index); + InstructionCost Extract0Cost = TTI.getVectorInstrCost( + Instruction::ExtractElement, VecTy, Ext0Index, Ext0); + InstructionCost Extract1Cost = TTI.getVectorInstrCost( + Instruction::ExtractElement, VecTy, Ext1Index, Ext1); // A more expensive extract will always be replaced by a splat shuffle. // For example, if Ext0 is more expensive: @@ -661,7 +661,7 @@ // Get cost estimate for the insert element. This cost will factor into // both sequences. InstructionCost InsertCost = - TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index); + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index, &I); InstructionCost OldCost = (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 
0 : InsertCost) + VectorOpCost; InstructionCost NewCost = ScalarOpCost + InsertCost + @@ -750,8 +750,8 @@ return false; InstructionCost OldCost = - TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); - OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); + TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0, Ext0); + OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1, Ext1); OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred) * @@ -771,7 +771,7 @@ NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, ShufMask); NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); - NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex); + NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex, Ext0); // Aggressively form vector ops if the cost is equal because the transform // may enable further optimization. @@ -1031,7 +1031,7 @@ auto *Index = dyn_cast(UI->getOperand(1)); OriginalCost += TTI.getVectorInstrCost(Instruction::ExtractElement, LI->getType(), - Index ? Index->getZExtValue() : -1); + Index ? 
Index->getZExtValue() : -1, UI); ScalarizedCost += TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(), Align(1), LI->getPointerAddressSpace()); diff --git a/llvm/test/Analysis/CostModel/AArch64/kryo.ll b/llvm/test/Analysis/CostModel/AArch64/kryo.ll --- a/llvm/test/Analysis/CostModel/AArch64/kryo.ll +++ b/llvm/test/Analysis/CostModel/AArch64/kryo.ll @@ -21,6 +21,26 @@ ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1 %t3 = insertelement <2 x i64> undef, i64 undef, i32 0 %t4 = insertelement <2 x i64> undef, i64 undef, i32 1 - ret void } + +; CHECK-LABEL: vectorInstrExtractCost +define i64 @vectorInstrExtractCost(<4 x i64> %vecreg) { + ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 1 + ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 2 + %t1 = extractelement <4 x i64> %vecreg, i32 1 + %t2 = extractelement <4 x i64> %vecreg, i32 2 + %ele = add i64 %t2, 1 + %cond = icmp eq i64 %t1, %ele + + ; Vector extracts - extracting each element should have a cost + ; if they are used as integers. + ; + ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 0 + ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 3 + %t0 = extractelement <4 x i64> %vecreg, i32 0 + %t3 = extractelement <4 x i64> %vecreg, i32 3 + %val = select i1 %cond, i64 %t0 , i64 %t3 + + ret i64 %val +}