Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -568,6 +568,9 @@ unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF) const; + /// Return true if the target will expand (scalarize) this vector instruction. + bool isVecInstrExpanded(unsigned Opcode, Type *VecTy) const; + /// If target has efficient vector element load/store instructions, it can /// return true here so that insertion/extraction costs are not added to /// the scalarization cost of a load/store. @@ -761,12 +764,16 @@ /// \p Args is an optional argument which holds the instruction operands /// values so the TTI can analyze those values searching for special /// cases or optimizations based on those values. + /// \p Insert and \p Extracts let the caller indicate that, if the target + /// scalarizes this instruction, inserting the result and/or extracting + /// the individual operands is not needed. int getArithmeticInstrCost( unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue, OperandValueKind Opd2Info = OK_AnyValue, OperandValueProperties Opd1PropInfo = OP_None, OperandValueProperties Opd2PropInfo = OP_None, - ArrayRef Args = ArrayRef()) const; + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef()) const; /// \return The cost of a shuffle instruction of kind Kind and of type Tp. /// The index and subtype parameters are used by the subvector insertion and @@ -776,9 +783,10 @@ /// \return The expected cost of cast instructions, such as bitcast, trunc, /// zext, etc. If there is an existing instruction that holds Opcode, it - /// may be passed in the 'I' parameter. + /// may be passed in the 'I' parameter. For Insert/Extract, see comment above. 
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I = nullptr) const; + const Instruction *I = nullptr, bool Insert = true, + bool Extract = true) const; /// \return The expected cost of a sign- or zero-extended vector extract. Use /// -1 to indicate that there is no information about the index value. @@ -1073,6 +1081,7 @@ getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0; virtual unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF) = 0; + virtual bool isVecInstrExpanded(unsigned Opcode, Type *VecTy) = 0; virtual bool supportsEfficientVectorElementLoadStore() = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; virtual const MemCmpExpansionOptions *enableMemCmpExpansion( @@ -1115,11 +1124,13 @@ OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, OperandValueProperties Opd2PropInfo, - ArrayRef Args) = 0; + ArrayRef Args, + bool Insert, ArrayRef Extracts) = 0; virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) = 0; virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I) = 0; + const Instruction *I, bool Insert, + bool Extract) = 0; virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) = 0; virtual int getCFInstrCost(unsigned Opcode) = 0; @@ -1340,6 +1351,10 @@ return Impl.getOperandsScalarizationOverhead(Args, VF); } + bool isVecInstrExpanded(unsigned Opcode, Type *VecTy) override { + return Impl.isVecInstrExpanded(Opcode, VecTy); + } + bool supportsEfficientVectorElementLoadStore() override { return Impl.supportsEfficientVectorElementLoadStore(); } @@ -1440,17 +1455,19 @@ OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, OperandValueProperties Opd2PropInfo, - ArrayRef Args) override { + ArrayRef Args, + bool Insert, ArrayRef Extracts) override { return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo, Args); 
+ Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); } int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) override { return Impl.getShuffleCost(Kind, Tp, Index, SubTp); } int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I) override { - return Impl.getCastInstrCost(Opcode, Dst, Src, I); + const Instruction *I, bool Insert, + bool Extract) override { + return Impl.getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract); } int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) override { Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -302,6 +302,8 @@ unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF) { return 0; } + bool isVecInstrExpanded(unsigned Opcode, Type *VecTy) { return false; } + bool supportsEfficientVectorElementLoadStore() { return false; } bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } @@ -405,7 +407,8 @@ TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, - ArrayRef Args) { + ArrayRef Args, + bool Insert, ArrayRef Extracts) { return 1; } @@ -415,7 +418,8 @@ } unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I) { return 1; } + const Instruction *I, bool Insert, + bool Extract) { return 1; } unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) { Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -502,15 +502,24 @@ return Cost; } - unsigned getScalarizationOverhead(Type *VecTy, ArrayRef Args) { + unsigned getScalarizationOverhead(Type *VecTy, ArrayRef Args, + 
bool Insert = true, ArrayRef Extracts = ArrayRef()) { assert(VecTy->isVectorTy()); unsigned Cost = 0; - Cost += getScalarizationOverhead(VecTy, true, false); - if (!Args.empty()) - Cost += getOperandsScalarizationOverhead(Args, + if (Insert) + Cost += getScalarizationOverhead(VecTy, true, false); + if (!Args.empty()) { + SmallVector VecArgs; + for (unsigned i = 0; i < Args.size(); i++) { + if (Extracts.size() > i && !Extracts[i]) + continue; + VecArgs.push_back(Args[i]); + } + Cost += getOperandsScalarizationOverhead(VecArgs, VecTy->getVectorNumElements()); + } else // When no information on arguments is provided, we add the cost // associated with one argument as a heuristic. @@ -527,7 +536,8 @@ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()) { + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef()) { // Check if any of the operands are vector operands. const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -561,7 +571,7 @@ ->getArithmeticInstrCost(Opcode, Ty->getScalarType()); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - return getScalarizationOverhead(Ty, Args) + Num * Cost; + return getScalarizationOverhead(Ty, Args, Insert, Extracts) + Num * Cost; } // We don't know anything about this scalar instruction. 
@@ -585,7 +595,8 @@ } unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I = nullptr) { + const Instruction *I = nullptr, + bool Insert = true, bool Extract = true) { const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -681,7 +692,8 @@ Src->getVectorNumElements() / 2); T *TTI = static_cast(this); return TTI->getVectorSplitCost() + - (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I)); + (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I, + Insert, Extract)); } // In other cases where the source or destination are illegal, assume @@ -692,7 +704,7 @@ // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - return getScalarizationOverhead(Dst, true, true) + Num * Cost; + return getScalarizationOverhead(Dst, Insert, Extract) + Num * Cost; } // We already handled vector-to-vector and scalar-to-scalar conversions. Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -251,6 +251,11 @@ return TTIImpl->getOperandsScalarizationOverhead(Args, VF); } +bool TargetTransformInfo:: +isVecInstrExpanded(unsigned Opcode, Type *VecTy) const { + return TTIImpl->isVecInstrExpanded(Opcode, VecTy); +} + bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const { return TTIImpl->supportsEfficientVectorElementLoadStore(); } @@ -435,9 +440,10 @@ unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, OperandValueProperties Opd2PropInfo, - ArrayRef Args) const { + ArrayRef Args, + bool Insert, ArrayRef Extracts) const { int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo, Args); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); assert(Cost >= 0 && 
"TTI should not produce negative costs!"); return Cost; } @@ -450,10 +456,11 @@ } int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src, const Instruction *I) const { + Type *Src, const Instruction *I, bool Insert, bool Extract) const { assert ((I == nullptr || I->getOpcode() == Opcode) && "Opcode should reflect passed instruction."); - int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, I); + int Cost = + TTIImpl->getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -111,7 +111,8 @@ unsigned getMaxInterleaveFactor(unsigned VF); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I = nullptr); + const Instruction *I = nullptr, + bool Insert = true, bool Extract = true); int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index); @@ -124,7 +125,8 @@ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()); + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef()); int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -259,7 +259,7 @@ } int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I) { + const Instruction *I, bool Insert, bool Extract) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && 
"Invalid opcode"); @@ -287,7 +287,7 @@ EVT DstTy = TLI->getValueType(DL, Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract); static const TypeConversionCostTblEntry ConversionTbl[] = { @@ -391,7 +391,7 @@ SrcTy.getSimpleVT())) return Entry->Cost; - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract); } int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, @@ -477,7 +477,8 @@ int AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args, + bool Insert, ArrayRef Extracts) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -496,7 +497,7 @@ switch (ISD) { default: return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); case ISD::SDIV: if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { @@ -543,7 +544,7 @@ } Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); if (Ty->isVectorTy()) { // On AArch64, vector divisions are not supported natively and are // expanded into scalar divisions of each pair of elements. @@ -552,7 +553,8 @@ Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); // TODO: if one of the arguments is scalar, then it's not necessary to - // double the cost of handling the vector elements. + // double the cost of handling the vector elements. 
Note: this may be + // handled by implementing isVecInstrExpanded(). Cost += Cost; } return Cost; Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -165,7 +165,8 @@ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()); + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef()); unsigned getCFInstrCost(unsigned Opcode); Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -335,11 +335,12 @@ int GCNTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args ) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args, + bool Insert, ArrayRef Extracts) { EVT OrigTy = TLI->getValueType(DL, Ty); if (!OrigTy.isSimple()) { return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); } // Legalize the type. 
@@ -439,7 +440,7 @@ } return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); } unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) { Index: lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.h +++ lib/Target/ARM/ARMTargetTransformInfo.h @@ -146,7 +146,8 @@ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I = nullptr); + const Instruction *I = nullptr, + bool Insert = true, bool Extract = true); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I = nullptr); @@ -162,7 +163,8 @@ TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()); + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef()); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I = nullptr); Index: lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.cpp +++ lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -134,7 +134,7 @@ } int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I) { + const Instruction *I, bool Insert, bool Extract) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -157,7 +157,7 @@ EVT DstTy = TLI->getValueType(DL, Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract); // Some arithmetic, load and store operations have specific instructions // to 
cast up/down their types automatically at no extra cost. @@ -324,7 +324,7 @@ return Entry->Cost; } - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract); } int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, @@ -478,7 +478,7 @@ unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, - ArrayRef Args) { + ArrayRef Args, bool Insert, ArrayRef Extracts) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -530,7 +530,7 @@ return LT.first * Entry->Cost; int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); // This is somewhat of a hack. The problem that we are facing is that SROA // creates a sequence of shift, and, or instructions to construct values. 
Index: lib/Target/Hexagon/HexagonTargetTransformInfo.h =================================================================== --- lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -131,9 +131,11 @@ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()); + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef()); unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I = nullptr); + const Instruction *I = nullptr, + bool Insert = true, bool Extract = true); unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); unsigned getCFInstrCost(unsigned Opcode) { Index: lib/Target/Hexagon/HexagonTargetTransformInfo.cpp =================================================================== --- lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -226,18 +226,19 @@ unsigned HexagonTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args, + bool Insert, ArrayRef Extracts) { if (Ty->isVectorTy()) { std::pair LT = TLI.getTypeLegalizationCost(DL, Ty); if (LT.second.isFloatingPoint()) return LT.first + FloatFactor * getTypeNumElements(Ty); } return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo, Args); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); } unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy, - Type *SrcTy, const Instruction *I) { + Type *SrcTy, const Instruction *I, bool Insert, bool Extract) { if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) { unsigned SrcN = 
SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0; unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0; Index: lib/Target/Lanai/LanaiTargetTransformInfo.h =================================================================== --- lib/Target/Lanai/LanaiTargetTransformInfo.h +++ lib/Target/Lanai/LanaiTargetTransformInfo.h @@ -82,13 +82,14 @@ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()) { + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef()) { int ISD = TLI->InstructionOpcodeToISD(Opcode); switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); case ISD::MUL: case ISD::SDIV: case ISD::UDIV: Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.h =================================================================== --- lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -92,7 +92,8 @@ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()); + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef()); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp =================================================================== --- lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -115,7 +115,8 @@ int NVPTXTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo, ArrayRef 
Args) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args, + bool Insert, ArrayRef Extracts) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -124,7 +125,7 @@ switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); case ISD::ADD: case ISD::MUL: case ISD::XOR: @@ -137,7 +138,7 @@ return 2 * LT.first; // Delegate other cases to the basic TTI. return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); } } Index: lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.h +++ lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -77,10 +77,11 @@ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()); + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I = nullptr); + const Instruction *I = nullptr, bool Insert = true, bool Extract = true); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -327,12 +327,13 @@ int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties 
Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args, + bool Insert, ArrayRef Extracts) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); // Fallback to the default implementation. return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); } int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, @@ -349,10 +350,10 @@ } int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I) { + const Instruction *I, bool Insert, bool Extract) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract); } int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -70,18 +70,21 @@ bool supportsEfficientVectorElementLoadStore() { return true; } bool enableInterleavedAccessVectorization() { return true; } + bool isVecInstrExpanded(unsigned Opcode, Type *VecTy); int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()); + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy); unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy); int getCastInstrCost(unsigned 
Opcode, Type *Dst, Type *Src, - const Instruction *I = nullptr); + const Instruction *I = nullptr, + bool Insert = true, bool Extract = true); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -347,12 +347,28 @@ return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U)); } +bool SystemZTTIImpl::isVecInstrExpanded(unsigned Opcode, Type *VecTy) { + assert(VecTy->isVectorTy() || VecTy->isVoidTy()); + // These opcodes appear to translate to expanded vector DAG nodes here, but + // they are in fact not expanded. + if ((Opcode == Instruction::Select) || (Opcode == Instruction::SExt) || + (Opcode == Instruction::ZExt) || (Opcode == Instruction::Trunc)) + return false; + const TargetLoweringBase *TLI = getTLI(); + int ISD = TLI->InstructionOpcodeToISD(Opcode); + if (!ISD) + return false; + std::pair LT = TLI->getTypeLegalizationCost(DL, VecTy); + return TLI->isOperationExpand(ISD, LT.second); +} + int SystemZTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, - ArrayRef Args) { + ArrayRef Args, + bool Insert, ArrayRef Extracts) { // TODO: return a good value for BB-VECTORIZER that includes the // immediate loads, which we do not want to count for the loop @@ -408,7 +424,8 @@ if (DivRemConstPow2) return (NumVectors * (SignedDivRem ? 
SDivPow2Cost : 1)); if (DivRemConst) - return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args); + return VF * DivMulSeqCost + + getScalarizationOverhead(Ty, Args, Insert, Extracts); if ((SignedDivRem || UnsignedDivRem) && VF > 4) // Temporary hack: disable high vectorization factors with integer // division/remainder, which will get scalarized and handled with @@ -431,7 +448,8 @@ // inserting and extracting the values. unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); - unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args); + unsigned Cost = (VF * ScalarCost) + + getScalarizationOverhead(Ty, Args, Insert, Extracts); // FIXME: VF 2 for these FP operations are currently just as // expensive as for VF 4. if (VF == 2) @@ -448,7 +466,8 @@ // There is no native support for FRem. if (Opcode == Instruction::FRem) { - unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args); + unsigned Cost = (VF * LIBCALL_COST) + + getScalarizationOverhead(Ty, Args, Insert, Extracts); // FIXME: VF 2 for float is currently just as expensive as for VF 4. if (VF == 2 && ScalarBits == 32) Cost *= 2; @@ -494,7 +513,7 @@ // Fallback to the default implementation. return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, - Opd1PropInfo, Opd2PropInfo, Args); + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts); } int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, @@ -636,7 +655,7 @@ } int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I) { + const Instruction *I, bool Insert, bool Extract) { unsigned DstScalarBits = Dst->getScalarSizeInBits(); unsigned SrcScalarBits = Src->getScalarSizeInBits(); @@ -696,16 +715,15 @@ unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType()); unsigned TotCost = VF * ScalarCost; - bool NeedsInserts = true, NeedsExtracts = true; // FP128 registers do not get inserted or extracted. 
if (DstScalarBits == 128 && (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)) - NeedsInserts = false; + Insert = false; if (SrcScalarBits == 128 && (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI)) - NeedsExtracts = false; + Extract = false; - TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts); + TotCost += getScalarizationOverhead(Dst, Insert, Extract); // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4. if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32) @@ -716,7 +734,8 @@ if (Opcode == Instruction::FPTrunc) { if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements. - return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false); + return VF /*ldxbr/lexbr*/ + + getScalarizationOverhead(Dst, Insert, false); else // double -> float return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/); } @@ -729,7 +748,7 @@ return VF * 2; } // -> fp128. VF * lxdb/lxeb + extraction of elements. - return VF + getScalarizationOverhead(Src, false, true); + return VF + getScalarizationOverhead(Src, false, Extract); } } else { // Scalar @@ -758,7 +777,7 @@ } } - return BaseT::getCastInstrCost(Opcode, Dst, Src, I); + return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract); } int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Index: lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h =================================================================== --- lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -62,7 +62,8 @@ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()); + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef()); unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); /// @} Index: 
lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp =================================================================== --- lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -46,7 +46,8 @@ unsigned WebAssemblyTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args, + bool Insert, ArrayRef Extracts) {/* NOTE(review): Insert and Extracts are accepted here, but the BasicTTIImplBase::getArithmeticInstrCost call that follows passes only the first six arguments, while the X86 fallback in this same patch forwards Args, Insert and Extracts. Confirm the omission is intended. */ unsigned Cost = BasicTTIImplBase::getArithmeticInstrCost( Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -68,10 +68,11 @@ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()); + ArrayRef Args = ArrayRef(), + bool Insert = true, ArrayRef Extracts = ArrayRef());/* Insert and Extracts are trailing parameters with defaults (true and an empty ArrayRef). */ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I = nullptr); + const Instruction *I = nullptr, bool Insert = true, bool Extract = true);/* The cast variant takes two defaulted bool flags, Insert and Extract, instead of a per-operand list. */ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -174,7 +174,8 @@ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, 
TTI::OperandValueProperties Opd2PropInfo, - ArrayRef Args) { + ArrayRef Args, + bool Insert, ArrayRef Extracts) {/* Within the hunks shown, Insert and Extracts are only forwarded to BaseT::getArithmeticInstrCost in the fallback return. */ // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -853,7 +854,8 @@ } // Fallback to the default implementation. - return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); + return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);/* The replaced call passed only Opcode, Ty, Op1Info and Op2Info; all nine arguments are now forwarded. */ } int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, @@ -1194,7 +1196,7 @@ } int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I) { + const Instruction *I, bool Insert, bool Extract) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -1566,7 +1568,7 @@ return Entry->Cost; } - return BaseT::getCastInstrCost(Opcode, Dst, Src, I); + return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract); } int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1057,6 +1057,7 @@ setCostBasedWideningDecision(VF); collectLoopUniforms(VF); collectLoopScalars(VF); + collectTargetScalarized(VF);/* Populates TargetScalarized[VF]; invoked right after collectLoopScalars(VF). */ } /// Returns true if the target machine supports masked store operation @@ -1246,6 +1247,8 @@ /// The data is collected per VF. DenseMap> Scalars; + DenseMap> TargetScalarized;/* Per-VF set of the loop instructions for which TTI.isVecInstrExpanded returned true; filled by collectTargetScalarized. */ + /// Holds the instructions (address computations) that are forced to be /// scalarized. DenseMap> ForcedScalars; @@ -1276,6 +1279,11 @@ /// iteration of the original scalar loop. 
void collectLoopScalars(unsigned VF); + void collectTargetScalarized(unsigned VF);/* Records in TargetScalarized[VF] every instruction of TheLoop that TTI.isVecInstrExpanded reports for the type built by ToVectorTy. */ + + bool isTargetScalarizedIns(const Value *V, unsigned VF);/* True if V passes the dyn_cast and is either outside TheLoop or recorded in TargetScalarized[VF]; false otherwise. */ + bool hasOnlyTargetScalarizedUses(const Instruction *I, unsigned VF);/* True if every user of I is either recorded in TargetScalarized[VF] or outside TheLoop. */ + /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. using DecisionList = DenseMap, @@ -4296,6 +4304,44 @@ Scalars[VF].insert(Worklist.begin(), Worklist.end()); } +void LoopVectorizationCostModel::collectTargetScalarized(unsigned VF) {/* Visits every instruction of every block in TheLoop, builds VecTy with ToVectorTy(ScalarTy, VF), and records the instruction in TargetScalarized[VF] when TTI.isVecInstrExpanded returns true. The assert requires VF >= 2 and no existing TargetScalarized entry for VF. */ + assert(VF >= 2 && TargetScalarized.find(VF) == TargetScalarized.end() && + "This function should not be visited twice for the same VF"); + + for (auto *BB : TheLoop->blocks()) + for (auto &I : *BB) { + Type *ScalarTy = I.getType(); + if (StoreInst *SI = dyn_cast(&I))/* For stores, query with the type of the stored value rather than I.getType(). */ + ScalarTy = SI->getValueOperand()->getType(); + Type *VecTy = ToVectorTy(ScalarTy, VF); + if (TTI.isVecInstrExpanded(I.getOpcode(), VecTy)) + TargetScalarized[VF].insert(&I); + } +} + +bool LoopVectorizationCostModel:: +isTargetScalarizedIns(const Value *V, unsigned VF) {/* Returns true when the dyn_cast of V succeeds and the result is either outside TheLoop or present in TargetScalarized[VF]. NOTE(review): any V for which the dyn_cast fails yields false, so the callers in this patch leave the matching Extract flag set to true for such operands. Confirm that is intended. */ + if (auto *I = dyn_cast(V)) { + if (!TheLoop->contains(I)) + // Assume extraction is done in preheader. 
+ return true; + if (TargetScalarized[VF].find(I) != TargetScalarized[VF].end()) + return true; + } + return false; +} + +bool LoopVectorizationCostModel:: +hasOnlyTargetScalarizedUses(const Instruction *I, unsigned VF) {/* Returns false as soon as a user of I is inside TheLoop and not in TargetScalarized[VF]; users outside TheLoop are ignored. Returns true otherwise, including when I has no uses. */ + for (const Use &U : I->uses()) { + const Instruction *UI = cast(U.getUser()); + if (TargetScalarized[VF].find(UI) == TargetScalarized[VF].end() && + TheLoop->contains(UI)) + return false; + } + return true; +} + bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { if (!blockNeedsPredication(I->getParent())) return false; @@ -5815,12 +5861,18 @@ TTI.getOperandInfo(Op2, Op2VP); if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) Op2VK = TargetTransformInfo::OK_UniformValue; - SmallVector Operands(I->operand_values()); + bool Insert = true; + SmallVector Extracts(Operands.size(), true);/* One flag per operand, all initially true. */ + if (VF > 1 && isTargetScalarizedIns(I, VF)) {/* Only for VF > 1 and an I accepted by isTargetScalarizedIns: Insert is cleared when hasOnlyTargetScalarizedUses(I, VF) holds, and Extracts[i] is cleared for each operand accepted by isTargetScalarizedIns. */ + Insert = !hasOnlyTargetScalarizedUses(I, VF); + for (unsigned i = 0; i < Operands.size(); i++) + Extracts[i] = !isTargetScalarizedIns(Operands[i], VF); + } unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; return N * TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, - Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); + I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, Op2VK, + TargetTransformInfo::OP_None, Op2VP, Operands, Insert, Extracts); } case Instruction::Select: { SelectInst *SI = cast(I); @@ -5897,8 +5949,15 @@ } } + bool Insert = true; + bool Extract = true; + if (VF > 1 && isTargetScalarizedIns(I, VF)) {/* Same adjustment for casts; only operand 0 is checked for the Extract flag. */ + Insert = !hasOnlyTargetScalarizedUses(I, VF); + Extract = !isTargetScalarizedIns(I->getOperand(0), VF); + } unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; - return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); + return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I, + Insert, Extract); } case Instruction::Call: { bool NeedToScalarize;