Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -575,9 +575,11 @@ /// Phi, Ret, Br. int getCFInstrCost(unsigned Opcode) const; - /// \returns The expected cost of compare and select instructions. + /// \returns The expected cost of compare and select instructions. If there + /// is an existing instruction that holds Opcode, it may be passed in the + /// 'I' parameter. int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy = nullptr) const; + Type *CondTy = nullptr, const Instruction *I = nullptr) const; /// \return The expected cost of vector Insert and Extract. /// Use -1 to indicate that there is no information on the index value. @@ -809,7 +811,7 @@ VectorType *VecTy, unsigned Index) = 0; virtual int getCFInstrCost(unsigned Opcode) = 0; virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) = 0; + Type *CondTy, const Instruction *I) = 0; virtual int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) = 0; virtual int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, @@ -1055,8 +1057,9 @@ int getCFInstrCost(unsigned Opcode) override { return Impl.getCFInstrCost(Opcode); } - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) override { - return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) override { + return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) override { return Impl.getVectorInstrCost(Opcode, Val, Index); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -336,7 +336,8 @@ unsigned getCFInstrCost(unsigned Opcode) { return 1; } - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { + unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { return 1; } Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -319,6 +319,23 @@ return Cost; } + unsigned getScalarizationOverhead(Type *VecTy, ArrayRef Args) { + assert (VecTy->isVectorTy()); + + unsigned Cost = 0; + + Cost += getScalarizationOverhead(VecTy, true, false); + if (!Args.empty()) + Cost += getOperandsScalarizationOverhead(Args, + VecTy->getVectorNumElements()); + else + // When no information on arguments is provided, we add the cost + // associated with one argument as a heuristic. + Cost += getScalarizationOverhead(VecTy, false, true); + + return Cost; + } + unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } unsigned getArithmeticInstrCost( @@ -361,15 +378,7 @@ ->getArithmeticInstrCost(Opcode, Ty->getScalarType()); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - unsigned TotCost = getScalarizationOverhead(Ty, true, false) + Num * Cost; - if (!Args.empty()) - TotCost += getOperandsScalarizationOverhead(Args, Num); - else - // When no information on arguments is provided, we add the cost - // associated with one argument as a heuristic. 
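Note on the extended hook: any cost client can now hand the IR instruction through the new optional parameter. A minimal sketch of such a caller, assuming the usual TargetTransformInfo and IR headers (the helper name and the use of makeCmpResultType are ours, not part of the patch):

  // Query the compare cost with the originating instruction attached, so a
  // target implementation can inspect its predicate and users.
  static int getCompareCost(const TargetTransformInfo &TTI, const CmpInst *Cmp) {
    Type *ValTy = Cmp->getOperand(0)->getType();
    Type *CondTy = CmpInst::makeCmpResultType(ValTy); // i1, or a vector of i1
    return TTI.getCmpSelInstrCost(Cmp->getOpcode(), ValTy, CondTy, Cmp);
  }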
- TotCost += getScalarizationOverhead(Ty, false, true); - - return TotCost; + return getScalarizationOverhead(Ty, Args) + Num * Cost; } // We don't know anything about this scalar instruction. @@ -512,7 +521,8 @@ return 0; } - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { + unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -540,7 +550,7 @@ if (CondTy) CondTy = CondTy->getScalarType(); unsigned Cost = static_cast(this)->getCmpSelInstrCost( - Opcode, ValTy->getScalarType(), CondTy); + Opcode, ValTy->getScalarType(), CondTy, I); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. Index: lib/Analysis/CostModel.cpp =================================================================== --- lib/Analysis/CostModel.cpp +++ lib/Analysis/CostModel.cpp @@ -447,12 +447,12 @@ case Instruction::Select: { const SelectInst *SI = cast(I); Type *CondTy = SI->getCondition()->getType(); - return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy); + return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy, I); } case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); - return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy); + return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy, I->getType(), I); } case Instruction::Store: { const StoreInst *SI = cast(I); Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -329,8 +329,8 @@ } int TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { - int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy); + Type *CondTy, const Instruction *I) const { + int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy, I); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -103,7 +103,8 @@ int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -436,7 +436,7 @@ } int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { + Type *CondTy, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // We don't lower some vector selects well that are wider than the register @@ -463,7 +463,7 @@ return Entry->Cost; } } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, Index: lib/Target/ARM/ARMTargetTransformInfo.h 
=================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.h +++ lib/Target/ARM/ARMTargetTransformInfo.h @@ -96,7 +96,8 @@ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); Index: lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.cpp +++ lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -310,7 +310,8 @@ return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } -int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { +int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // On NEON a a vector select gets lowered to vbsl. @@ -335,7 +336,7 @@ return LT.first; } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, Index: lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.h +++ lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -75,7 +75,8 @@ ArrayRef Args = ArrayRef()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -308,8 +308,9 @@ return BaseT::getCastInstrCost(Opcode, Dst, Src); } -int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); +int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { Index: lib/Target/SystemZ/SystemZISelLowering.cpp =================================================================== --- lib/Target/SystemZ/SystemZISelLowering.cpp +++ lib/Target/SystemZ/SystemZISelLowering.cpp @@ -347,9 +347,13 @@ // There should be no need to check for float types other than v2f64 // since <2 x f32> isn't a legal type. setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal); } // Handle floating-point types. 
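The new v2f64 entries presumably make these conversions report Legal when the query is keyed on the FP side of the operation rather than on v2i64; a hypothetical check of that shape, for illustration only (the function name is ours):

  // With only the v2i64 entries registered, a legality query keyed on the
  // operand type would fail even though the instruction exists.
  static bool fpToSIntIsLegal(const TargetLoweringBase &TLI) {
    return TLI.isOperationLegal(ISD::FP_TO_SINT, MVT::v2i64) &&
           TLI.isOperationLegal(ISD::FP_TO_SINT, MVT::v2f64);
  }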
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -27,6 +27,8 @@ const SystemZSubtarget *getST() const { return ST; } const SystemZTargetLowering *getTLI() const { return TLI; } + unsigned const LIBCALL_COST = 30; + public: explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), @@ -53,6 +55,18 @@ unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); + bool isFPVectorizationPotentiallyUnsafe() { return false; } + + int getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I); /// @} }; Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -259,11 +259,17 @@ } } if (isa(&I)) { - NumStores++; Type *MemAccessTy = I.getOperand(0)->getType(); - if((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) && + if ((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) && (getDataLayout().getTypeSizeInBits(MemAccessTy) == 128)) - NumStores++; // 128 bit fp/int stores get split. + NumStores += 2; // 128 bit fp/int stores get split. + else if (MemAccessTy->isVectorTy()) { + unsigned NumExpandedStores = + std::max(1U, MemAccessTy->getPrimitiveSizeInBits() / 128); + NumStores += NumExpandedStores; + } + else + NumStores++; } } @@ -313,3 +319,407 @@ return 0; } +int SystemZTTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef Args) { + + // TODO: return a good value for BB-VECTORIZER that includes the + // immediate loads, which we do not want to count for the loop + // vectorizer, since they are hopefully hoisted out of the loop. This + // would require a new parameter 'InLoop', but not sure if constant + // args are common enough to motivate this. + + unsigned ScalarBits = Ty->getScalarSizeInBits(); + + if (Ty->isVectorTy()) { + unsigned VF = Ty->getVectorNumElements(); + assert (VF <= 16 && "VF greater than 16?"); + assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type."); + unsigned NumVectors = std::max(1U, Ty->getPrimitiveSizeInBits() / 128); + + // These vector operations are custom handled, but are still supported + // with one instruction per vector, regardless of element size. + if (Opcode == Instruction::Shl || Opcode == Instruction::LShr || + Opcode == Instruction::AShr || Opcode == Instruction::Or) { + return NumVectors; + } + + // These FP operations are supported with a single vector instruction for + // double (base implementation assumes float generally costs 2). For + // FP128, the scalar cost is 1, and there is no overhead since the values + // are already in scalar registers. 
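Many of the SystemZ costs that follow are expressed in 128-bit vector registers. A small sketch of the recurring std::max(1U, Bits / 128) pattern with worked values (the helper name is ours):

  // Number of 128-bit vector registers needed for a vector type, as used
  // for the split-store counting above and the NumVectors costs below.
  static unsigned numVectorRegs(Type *VecTy) {
    assert(VecTy->isVectorTy() && "expected a vector type");
    return std::max(1U, VecTy->getPrimitiveSizeInBits() / 128U);
  }
  // e.g. <2 x i64> and <4 x float> -> 1, <4 x i64> -> 2, <16 x i32> -> 4.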
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || + Opcode == Instruction::FMul || Opcode == Instruction::FDiv) { + switch (ScalarBits) { + case 32: { + // Return the cost of multiple scalar invocation plus the cost of + // inserting and extracting the values. + unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); + unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args); + // FIXME: VF 2 for these FP operations are currently just as + // expensive as for VF 4. + if (VF == 2) + Cost *= 2; + return Cost; + } + case 64: + case 128: + return NumVectors; + default: + break; + } + } + + // There is no native support for FRem. + if (Opcode == Instruction::FRem) { + unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args); + // FIXME: VF 2 for float is currently just as expensive as for VF 4. + if (VF == 2 && ScalarBits == 32) + Cost *= 2; + return Cost; + } + } + else { // Scalar: + // These FP operations are supported with a dedicated instruction for + // float, double and fp128 (base implementation assumes float generally + // costs 2). + if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || + Opcode == Instruction::FMul || Opcode == Instruction::FDiv) + return 1; + + // There is no native support for FRem. + if (Opcode == Instruction::FRem) + return LIBCALL_COST; + + if (Opcode == Instruction::LShr || Opcode == Instruction::AShr) + return (ScalarBits >= 32 ? 1 : 2 /*ext*/); + + // Or requires one instruction, although it has custom handling for i64. + if (Opcode == Instruction::Or) + return 1; + + // An extra extension for narrow types is needed. + if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem)) + return (ScalarBits < 32 ? 4 /*sext of ops*/ : 2); + + if (Opcode == Instruction::UDiv || Opcode == Instruction::URem) + return (ScalarBits < 32 ? 4 /*zext of both ops*/ : 3); + } + + // Fallback to the default implementation. + return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo, Args); +} + +unsigned getLog2Diff(unsigned Bits0, unsigned Bits1) { + if (Bits1 > Bits0) + return (Log2_32(Bits1) - Log2_32(Bits0)); + return (Log2_32(Bits0) - Log2_32(Bits1)); +} + +int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { + + unsigned DstScalarBits = Dst->getScalarSizeInBits(); + unsigned SrcScalarBits = Src->getScalarSizeInBits(); + + if (Src->isVectorTy()) { + assert (Dst->isVectorTy()); + unsigned VF = Src->getVectorNumElements(); + assert (VF <= 16 && "VF greater than 16?"); + assert (ST->hasVector() && "getCastInstrCost() called with vector type."); + unsigned SrcVectorBits = Src->getPrimitiveSizeInBits(); + unsigned NumDstVectors = std::max(1U, Dst->getPrimitiveSizeInBits() / 128); + + if (Opcode == Instruction::Trunc) { + if (SrcVectorBits <= 256) + // Up to 2 vector registers can be truncated efficiently with pack or + // permute. The latter requires an immediate mask to be loaded, which + // hopefully gets hoisted to outside the loop. + return 1; + + else if (VF == 8) // Src is <8 x i64> + // Requires several pack / permutes. + return (DstScalarBits == 32 ? 2 : 3); + + else { // 16 vector elements + // Requires multiple pack instructions + if (SrcScalarBits == 32) + return (DstScalarBits == 16 ? 2 : 3); + else // Src is <16 x i64> + return (DstScalarBits == 32 ? 4 : (DstScalarBits == 16 ? 
6 : 7)); + } + + // TODO: return a good value for BB-VECTORIZER that includes the + // immediate loads, which we do not want to count for the loop + // vectorizer, since they are hopefully hoisted out of the loop. This + // would require a new parameter 'InLoop', but not sure if constant + // args are common enough to motivate this. + } + + if (Opcode == Instruction::FPTrunc) { + if (SrcScalarBits == 128) // fp128 -> double/float + return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false); + else // double -> float + return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/); + } + + if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { + if (SrcScalarBits >= 8) { + // ZExt/SExt will be handled with one unpack per doubling of width. + // For types that spans multiple vector registers, some additional + // vector operations are needed. + unsigned NumUnpacks = + getLog2Diff(Src->getScalarType()->getPrimitiveSizeInBits(), + Dst->getScalarType()->getPrimitiveSizeInBits()); + return (NumUnpacks * NumDstVectors) + (NumDstVectors - 1); + } + else if (Src->isIntegerTy(1)) { + // FIXME: i1 isn't optimally treated. + // These values reflect the current handling of i1 for sext/zext. + if (Opcode == Instruction::SExt) { + static const CostTblEntry SextCostTable[] = { + { ISD::SIGN_EXTEND, MVT::v2i8, 3}, + { ISD::SIGN_EXTEND, MVT::v2i16, 3}, + { ISD::SIGN_EXTEND, MVT::v2i32, 3}, + { ISD::SIGN_EXTEND, MVT::v2i64, 2}, + { ISD::SIGN_EXTEND, MVT::v4i8, 3}, + { ISD::SIGN_EXTEND, MVT::v4i16, 3}, + { ISD::SIGN_EXTEND, MVT::v4i32, 2}, + { ISD::SIGN_EXTEND, MVT::v4i64, 6}, + { ISD::SIGN_EXTEND, MVT::v8i8, 3}, + { ISD::SIGN_EXTEND, MVT::v8i16, 2}, + { ISD::SIGN_EXTEND, MVT::v8i32, 6}, + { ISD::SIGN_EXTEND, MVT::v8i64, 13}, + { ISD::SIGN_EXTEND, MVT::v16i8, 2}, + { ISD::SIGN_EXTEND, MVT::v16i16, 6}, + { ISD::SIGN_EXTEND, MVT::v16i32, 12}, + { ISD::SIGN_EXTEND, MVT::v16i64, 23}, + }; + MVT MTy = TLI->getValueType(DL, Dst).getSimpleVT(); + if (const auto *Entry = + CostTableLookup(SextCostTable, ISD::SIGN_EXTEND, MTy)) + return Entry->Cost; + } + else { // ZExt + static const CostTblEntry ZextCostTable[] = { + { ISD::ZERO_EXTEND, MVT::v2i8, 2}, + { ISD::ZERO_EXTEND, MVT::v2i16, 2}, + { ISD::ZERO_EXTEND, MVT::v2i32, 2}, + { ISD::ZERO_EXTEND, MVT::v2i64, 1}, + { ISD::ZERO_EXTEND, MVT::v4i8, 2}, + { ISD::ZERO_EXTEND, MVT::v4i16, 2}, + { ISD::ZERO_EXTEND, MVT::v4i32, 1}, + { ISD::ZERO_EXTEND, MVT::v4i64, 4}, + { ISD::ZERO_EXTEND, MVT::v8i8, 2}, + { ISD::ZERO_EXTEND, MVT::v8i16, 1}, + { ISD::ZERO_EXTEND, MVT::v8i32, 4}, + { ISD::ZERO_EXTEND, MVT::v8i64, 12}, + { ISD::ZERO_EXTEND, MVT::v16i8, 1}, + { ISD::ZERO_EXTEND, MVT::v16i16, 4}, + { ISD::ZERO_EXTEND, MVT::v16i32, 12}, + { ISD::ZERO_EXTEND, MVT::v16i64, 32}, + }; + MVT MTy = TLI->getValueType(DL, Dst).getSimpleVT(); + if (const auto *Entry = + CostTableLookup(ZextCostTable, ISD::ZERO_EXTEND, MTy)) + return Entry->Cost; + } + } + } + + if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP || + Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) { + // TODO: Fix base implementation which could simplify things a bit here + // (seems to miss on differentiating on scalar/vector types). + + // Only 64 bit vector conversions are natively supported. + if (SrcScalarBits == 64 && DstScalarBits == 64) + return NumDstVectors; + + // Return the cost of multiple scalar invocation plus the cost of + // inserting and extracting the values. Base implementation does not + // realize float->int gets scalarized. 
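The unpack-based zext/sext cost above has a simple closed form; a worked sketch (ours), valid for element sizes of 8 bits and up:

  // One unpack per doubling of the element width, replicated for each
  // destination vector register, plus NumDstVectors - 1 extra vector ops
  // when the result spans several registers.
  static unsigned vecExtCost(unsigned SrcBits, unsigned DstBits,
                             unsigned NumDstVectors) {
    unsigned NumUnpacks = Log2_32(DstBits) - Log2_32(SrcBits);
    return NumUnpacks * NumDstVectors + (NumDstVectors - 1);
  }
  // e.g. zext <4 x i16> to <4 x i64>: 2 unpacks * 2 vectors + 1 = 5.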
+ unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(), + Src->getScalarType()); + unsigned TotCost = VF * ScalarCost; + bool NeedsInserts = true, NeedsExtracts = true; + // FP128 registers do not get inserted or extracted. + if (DstScalarBits == 128 && + (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)) + NeedsInserts = false; + if (SrcScalarBits == 128 && + (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI)) + NeedsExtracts = false; + + TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts); + + // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4. + if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32) + TotCost *= 2; + + return TotCost; + } + + if (Opcode == Instruction::FPExt) { + if (SrcScalarBits == 32 && DstScalarBits == 64) { + // float -> double is very rare and currently unoptimized. Instead of + // using vldeb, which can do two at a time, all conversions are + // scalarized. + return VF * 2; + } + // VF * lxdb/lxeb + extraction of elements. + return VF + getScalarizationOverhead(Src, false, true); + } + } + else { // Scalar + assert (!Dst->isVectorTy()); + + if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) + return (SrcScalarBits <= 64 ? + (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/) : LIBCALL_COST); + + if (Opcode == Instruction::SExt && Src->isIntegerTy(1)) + // nilf/risbgn + lcr/lcgr + return 2; + } + + return BaseT::getCastInstrCost(Opcode, Dst, Src); +} + +static Type *ToVectorTy(Type *T, unsigned VF) { + if (!T->isVectorTy() && VF > 1) + return VectorType::get(T, VF); + return T; +} + +int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { + + // Hand over to common code if it's a compare for branch. + if (I != nullptr && I->hasOneUse() && + isa(I->use_begin()->getUser())) + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr); + + if (ValTy->isVectorTy()) { + unsigned VF = ValTy->getVectorNumElements(); + + // Called with a compare instruction. + if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { + Type *SelectedTy = nullptr; + unsigned PredicateExtraCost = 0; + if (I != nullptr) { + assert (isa(I)); + if (I->hasOneUse()) { // FIXME: Need to handle several users? + if (SelectInst *SI = dyn_cast(I->use_begin()->getUser())) + SelectedTy = ToVectorTy(SI->getType(), VF); + } + + // Some predicates cost one or two extra instructions. + switch (dyn_cast(I)->getPredicate()) { + case CmpInst::Predicate::ICMP_NE: + case CmpInst::Predicate::ICMP_UGE: + case CmpInst::Predicate::ICMP_ULE: + case CmpInst::Predicate::ICMP_SGE: + case CmpInst::Predicate::ICMP_SLE: + PredicateExtraCost = 1; + break; + case CmpInst::Predicate::FCMP_ONE: + case CmpInst::Predicate::FCMP_ORD: + case CmpInst::Predicate::FCMP_UEQ: + case CmpInst::Predicate::FCMP_UNO: + PredicateExtraCost = 2; + break; + default: + break; + } + } + + // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of + // floats. FIXME: <2 x float> generates same code as <4 x float>. + unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1); + unsigned NumVecs_cmp = + std::max(1U, ValTy->getPrimitiveSizeInBits() / 128); + unsigned NumVecs_sel = (SelectedTy != nullptr ? + std::max(1U, SelectedTy->getPrimitiveSizeInBits() / 128) : 1); + + // If the vector select is split, one compare will be done for each part. 
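A worked form of the vector compare cost being computed here (sketch, ours); the real code additionally takes the maximum of the compare's and the feeding select's vector counts:

  // Base cost per 128-bit compare plus the extra instructions some
  // predicates need (NE/UGE/ULE/SGE/SLE: 1, ONE/ORD/UEQ/UNO: 2). fcmp on
  // <4 x float> is modelled as 10 (2*vmr[lh]f + 2*vldeb + vfchdb per pair
  // of elements).
  static unsigned vecCmpCost(unsigned NumVecs, bool FloatElts,
                             unsigned PredExtraCost) {
    unsigned PerVector = FloatElts ? 10 : 1;
    return NumVecs * (PerVector + PredExtraCost);
  }
  // e.g. icmp sle <4 x i32> feeding a <4 x i32> select: 1 * (1 + 1) = 2.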
+ unsigned Cost = (std::max(NumVecs_cmp, NumVecs_sel) * + (CmpCostPerVector + PredicateExtraCost)); + + // In case the select gets split, and the compared element type is + // smaller than the selected one,extra instructions are needed to move + // the values into the operands for the compares. + if (SelectedTy != nullptr && NumVecs_sel > 1 && NumVecs_cmp < NumVecs_sel) { + Cost += NumVecs_sel; + if (NumVecs_sel == 4) + Cost += (ValTy->getScalarSizeInBits() < 32 ? 3 : 2); + else if (NumVecs_sel == 8) + Cost += 6; + } + + return Cost; + } + else { // Called with a select instruction. + assert (Opcode == Instruction::Select); + + unsigned NumVecs_sel = + std::max(1U, ValTy->getPrimitiveSizeInBits() / 128); + + // We can figure out the extra cost of packing / unpacking if the + // instruction was passed and the compare instruction is found. + unsigned PackCost = 0; + if (I != nullptr) { + assert (isa(I)); + + Type *ComparedTy = nullptr; + if (CmpInst *CI = dyn_cast(I->getOperand(0))) + ComparedTy = ToVectorTy(CI->getOperand(0)->getType(), VF); + + if (ComparedTy != nullptr) { + unsigned NumVecs_cmp = + std::max(1U, ComparedTy->getPrimitiveSizeInBits() / 128); + unsigned SelScalarBits = + ValTy->getScalarType()->getPrimitiveSizeInBits(); + unsigned CmpScalarBits = + ComparedTy->getScalarType()->getPrimitiveSizeInBits(); + unsigned Log2Diff = getLog2Diff(SelScalarBits, CmpScalarBits); + unsigned PacksPerVector = Log2Diff; + if (Log2Diff > 1 && NumVecs_sel <= 2 && NumVecs_cmp <= 2 && + CmpScalarBits > SelScalarBits) + PacksPerVector = 1; // permute used instead. + + // More work is done with very different element types and high + // vectorization factors. + if (Log2Diff > 1 && NumVecs_cmp > 2) + PacksPerVector += ((Log2Diff - 1) * (NumVecs_cmp / 4)); + + PackCost = PacksPerVector * NumVecs_sel; + + if (CmpScalarBits == 64 && SelScalarBits == 16 && VF == 16) + PackCost -= 2; // Minor adjustment + } + } + + return NumVecs_sel /*vsel*/ + PackCost; + } + } + else { // Scalar + switch (Opcode) { + case Instruction::ICmp: { + unsigned Cost = 1; + if (ValTy->getScalarSizeInBits() <= 16) + Cost += 2; // extend both operands + return Cost; + } + case Instruction::Select: + if (ValTy->isFloatingPointTy()) + return 4; // No load on condition for FP, so this costs a conditional jump. + return 1; // Load On Condition. 
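The scalar compare and select cases above compress to the following; a minimal sketch (ours) of the same costs, with scalar FCmp left to the base implementation as in the patch:

  // Narrow integer compares pay for two operand extensions; FP selects
  // have no load-on-condition form and are costed as a branch sequence.
  static unsigned scalarCmpSelCost(unsigned Opcode, Type *ValTy) {
    if (Opcode == Instruction::ICmp)
      return ValTy->getScalarSizeInBits() <= 16 ? 3 : 1; // cmp + 2 extends
    assert(Opcode == Instruction::Select && "FCmp handled by BaseT");
    return ValTy->isFloatingPointTy() ? 4 : 1; // FP select vs. LOC
  }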
+ } + } + + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr); +} Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -62,7 +62,8 @@ ArrayRef Args = ArrayRef()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -1300,7 +1300,8 @@ return BaseT::getCastInstrCost(Opcode, Dst, Src); } -int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { +int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); @@ -1366,7 +1367,7 @@ if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) return LT.first * Entry->Cost; - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, Index: lib/Transforms/InstCombine/InstCombineVectorOps.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -1351,90 +1351,135 @@ if (RHSShuffle && RHSOp0Width == LHSWidth) { newRHS = RHSOp0; } + // case 4 - if (LHSOp0 == RHSOp0) { + bool EqOp0s = (LHSOp0 == RHSOp0); + if (!EqOp0s && LHSOp0 != nullptr && RHSOp0 != nullptr) { + if (Instruction* LHSOp0Inst = dyn_cast(LHSOp0)) { + if (Instruction* RHSOp0Inst = dyn_cast(RHSOp0)) { + if (LHSOp0Inst->isIdenticalTo(RHSOp0Inst) && + !LHSOp0Inst->mayHaveSideEffects() && !LHSOp0Inst->mayReadFromMemory() && + !RHSOp0Inst->mayHaveSideEffects() && !RHSOp0Inst->mayReadFromMemory()) { + EqOp0s = true; + } + } + } + } + if (EqOp0s) { newLHS = LHSOp0; newRHS = nullptr; } - if (newLHS == LHS && newRHS == RHS) - return MadeChange ? &SVI : nullptr; - - SmallVector LHSMask; - SmallVector RHSMask; - if (newLHS != LHS) - LHSMask = LHSShuffle->getShuffleMask(); - if (RHSShuffle && newRHS != RHS) - RHSMask = RHSShuffle->getShuffleMask(); - - unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth; SmallVector newMask; - bool isSplat = true; - int SplatElt = -1; - // Create a new mask for the new ShuffleVectorInst so that the new - // ShuffleVectorInst is equivalent to the original one. - for (unsigned i = 0; i < VWidth; ++i) { - int eltMask; - if (Mask[i] < 0) { - // This element is an undef value. - eltMask = -1; - } else if (Mask[i] < (int)LHSWidth) { - // This element is from left hand side vector operand. - // - // If LHS is going to be replaced (case 1, 2, or 4), calculate the - // new mask value for the element. - if (newLHS != LHS) { - eltMask = LHSMask[Mask[i]]; - // If the value selected is an undef value, explicitly specify it - // with a -1 mask value. 
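The new 'case 4' test in InstCombineVectorOps above treats two distinct but identical, side-effect-free instructions as one shuffle source; a reduced sketch of that predicate (ours, with the duplicated checks on both instructions folded into one):

  static bool sameShuffleSource(Value *A, Value *B) {
    if (A == B)
      return true;
    auto *IA = dyn_cast_or_null<Instruction>(A);
    auto *IB = dyn_cast_or_null<Instruction>(B);
    return IA && IB && IA->isIdenticalTo(IB) &&
           !IA->mayHaveSideEffects() && !IA->mayReadFromMemory();
  }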
- if (eltMask >= (int)LHSOp0Width && isa(LHSOp1)) + + if (newLHS == LHS && newRHS == RHS) { + if (LHSShuffle != nullptr && RHSShuffle != nullptr) { + SmallVector LHSMask; + SmallVector RHSMask; + LHSMask = LHSShuffle->getShuffleMask(); + RHSMask = RHSShuffle->getShuffleMask(); + unsigned LHSShuffle_Width = cast(LHSShuffle->getOperand(0)->getType())->getNumElements(); + for (unsigned i = 0; i < VWidth; ++i) { + int eltMask; + + if (Mask[i] < 0) { + // This element is an undef value. eltMask = -1; - } else - eltMask = Mask[i]; - } else { - // This element is from right hand side vector operand - // - // If the value selected is an undef value, explicitly specify it - // with a -1 mask value. (case 1) - if (isa(RHS)) + } else if (Mask[i] < (int)LHSWidth) { + // This element is from left hand side vector operand. + // + eltMask = LHSMask[Mask[i]]; + } else { + // This element is from right hand side vector operand + // + eltMask = RHSMask[Mask[i] - LHSWidth] + LHSShuffle_Width; + } + + newMask.push_back(eltMask); + } + newLHS = LHSShuffle->getOperand(0); + newRHS = RHSShuffle->getOperand(0); + } + else { + return MadeChange ? &SVI : nullptr; + } + } + else { + SmallVector LHSMask; + SmallVector RHSMask; + if (newLHS != LHS) + LHSMask = LHSShuffle->getShuffleMask(); + if (RHSShuffle && newRHS != RHS) + RHSMask = RHSShuffle->getShuffleMask(); + + unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth; + bool isSplat = true; + int SplatElt = -1; + // Create a new mask for the new ShuffleVectorInst so that the new + // ShuffleVectorInst is equivalent to the original one. + for (unsigned i = 0; i < VWidth; ++i) { + int eltMask; + if (Mask[i] < 0) { + // This element is an undef value. eltMask = -1; - // If RHS is going to be replaced (case 3 or 4), calculate the - // new mask value for the element. - else if (newRHS != RHS) { - eltMask = RHSMask[Mask[i]-LHSWidth]; + } else if (Mask[i] < (int)LHSWidth) { + // This element is from left hand side vector operand. + // + // If LHS is going to be replaced (case 1, 2, or 4), calculate the + // new mask value for the element. + if (newLHS != LHS) { + eltMask = LHSMask[Mask[i]]; + // If the value selected is an undef value, explicitly specify it + // with a -1 mask value. + if (eltMask >= (int)LHSOp0Width && isa(LHSOp1)) + eltMask = -1; + } else + eltMask = Mask[i]; + } else { + // This element is from right hand side vector operand + // // If the value selected is an undef value, explicitly specify it - // with a -1 mask value. - if (eltMask >= (int)RHSOp0Width) { - assert(isa(RHSShuffle->getOperand(1)) - && "should have been check above"); + // with a -1 mask value. (case 1) + if (isa(RHS)) eltMask = -1; - } - } else - eltMask = Mask[i]-LHSWidth; - - // If LHS's width is changed, shift the mask value accordingly. - // If newRHS == NULL, i.e. LHSOp0 == RHSOp0, we want to remap any - // references from RHSOp0 to LHSOp0, so we don't need to shift the mask. - // If newRHS == newLHS, we want to remap any references from newRHS to - // newLHS so that we can properly identify splats that may occur due to - // obfuscation across the two vectors. - if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS) - eltMask += newLHSWidth; - } + // If RHS is going to be replaced (case 3 or 4), calculate the + // new mask value for the element. + else if (newRHS != RHS) { + eltMask = RHSMask[Mask[i]-LHSWidth]; + // If the value selected is an undef value, explicitly specify it + // with a -1 mask value. 
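When both inputs are shuffles and neither operand is being replaced, the new path composes the two masks straight through to the shuffles' first operands; the per-element mapping is essentially this (sketch, ours):

  // Element i of the outer mask either stays undef, maps through the LHS
  // shuffle's mask, or maps through the RHS shuffle's mask shifted past
  // the width of the LHS shuffle's source.
  static int composeMaskElt(int OuterElt, ArrayRef<int> LHSMask,
                            ArrayRef<int> RHSMask, unsigned LHSWidth,
                            unsigned LHSSrcWidth) {
    if (OuterElt < 0)
      return -1;
    if (OuterElt < (int)LHSWidth)
      return LHSMask[OuterElt];
    return RHSMask[OuterElt - (int)LHSWidth] + (int)LHSSrcWidth;
  }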
+ if (eltMask >= (int)RHSOp0Width) { + assert(isa(RHSShuffle->getOperand(1)) + && "should have been check above"); + eltMask = -1; + } + } else + eltMask = Mask[i]-LHSWidth; + + // If LHS's width is changed, shift the mask value accordingly. + // If newRHS == NULL, i.e. LHSOp0 == RHSOp0, we want to remap any + // references from RHSOp0 to LHSOp0, so we don't need to shift the mask. + // If newRHS == newLHS, we want to remap any references from newRHS to + // newLHS so that we can properly identify splats that may occur due to + // obfuscation across the two vectors. + if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS) + eltMask += newLHSWidth; + } - // Check if this could still be a splat. - if (eltMask >= 0) { - if (SplatElt >= 0 && SplatElt != eltMask) - isSplat = false; - SplatElt = eltMask; - } + // Check if this could still be a splat. + if (eltMask >= 0) { + if (SplatElt >= 0 && SplatElt != eltMask) + isSplat = false; + SplatElt = eltMask; + } - newMask.push_back(eltMask); + newMask.push_back(eltMask); + } } // If the result mask is equal to one of the original shuffle masks, // or is a splat, do the replacement. - if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) { +// if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) { SmallVector Elts; for (unsigned i = 0, e = newMask.size(); i != e; ++i) { if (newMask[i] < 0) { @@ -1445,8 +1490,8 @@ } if (!newRHS) newRHS = UndefValue::get(newLHS->getType()); - return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts)); - } + return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts)); +// } // If the result mask is an identity, replace uses of this instruction with // corresponding argument. Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6844,16 +6844,27 @@ if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); } case Instruction::ICmp: case Instruction::FCmp: { + // If this is the loop-latch compare for the back branch, just add the + // scalar value. Should this check be done in caller instead? + bool LikelyVectorized = true; + if (I->hasOneUse()) { + if (BranchInst *BI = dyn_cast(I->use_begin()->getUser())) { + if (BI->getParent() == TheLoop->getLoopLatch()) + LikelyVectorized = false; + } + } Type *ValTy = I->getOperand(0)->getType(); Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); - VectorTy = ToVectorTy(ValTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); + + if (LikelyVectorized) + VectorTy = ToVectorTy(ValTy, VF); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); } case Instruction::Store: case Instruction::Load: {
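Finally, the LoopVectorize change keeps the loop-latch exit compare scalar instead of pricing it as a vector compare; the test it adds boils down to this (sketch, ours):

  // The compare is only costed as a scalar when its single user is the
  // conditional branch in the loop latch block.
  static bool isLoopLatchCmp(const Instruction *Cmp, const Loop *TheLoop) {
    if (!Cmp->hasOneUse())
      return false;
    auto *BI = dyn_cast<BranchInst>(Cmp->user_back());
    return BI && BI->getParent() == TheLoop->getLoopLatch();
  }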