Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -413,6 +413,11 @@
 private:
   struct TreeEntry;
 
+  /// Calculates the cost of the transformation of \p VL instructions from
+  /// scalar to vector form.
+  Optional<int> getCost(unsigned Opcode, ArrayRef<Value *> VL, Type *ScalarTy,
+                        Type *VecTy) const;
+
   /// \returns the cost of the vectorizable entry.
   int getEntryCost(TreeEntry *E);
 
@@ -1627,8 +1632,175 @@
   return true;
 }
 
+Optional<int> BoUpSLP::getCost(unsigned Opcode, ArrayRef<Value *> VL,
+                               Type *ScalarTy, Type *VecTy) const {
+  assert(ScalarTy && VecTy &&
+         "both ScalarTy/VectorTy parameters must be specified.");
+  assert(Opcode && "Expected non-null opcode.");
+  auto *VL0 = cast<Instruction>(VL[0]);
+  int VecCost;
+  int ScalarCost;
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    VecCost = TTI->getCastInstrCost(
+        Opcode, VecTy, VectorType::get(SrcTy, VecTy->getVectorNumElements()));
+
+    // Calculate the cost of this instruction.
+    ScalarCost = VL.size() * TTI->getCastInstrCost(Opcode, ScalarTy, SrcTy);
+    break;
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+  case Instruction::Select: {
+    // Calculate the cost of this instruction.
+    VecCost = TTI->getCmpSelInstrCost(
+        Opcode, VecTy,
+        VectorType::get(Type::getInt1Ty(VL0->getContext()),
+                        VecTy->getVectorNumElements()));
+    ScalarCost =
+        VL.size() * TTI->getCmpSelInstrCost(Opcode, ScalarTy,
+                                            Type::getInt1Ty(VL0->getContext()));
+    break;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    // Certain instructions can be cheaper to vectorize if they have a
+    // constant second vector operand.
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+    TargetTransformInfo::OperandValueProperties Op1VP =
+        TargetTransformInfo::OP_None;
+    TargetTransformInfo::OperandValueProperties Op2VP =
+        TargetTransformInfo::OP_None;
+
+    // If all operands are exactly the same ConstantInt then set the
+    // operand kind to OK_UniformConstantValue.
+    // If instead not all operands are constants, then set the operand kind
+    // to OK_AnyValue. If all operands are constants but not the same,
+    // then set the operand kind to OK_NonUniformConstantValue.
+    ConstantInt *CInt = nullptr;
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      const Instruction *I = cast<Instruction>(VL[i]);
+      if (!isa<ConstantInt>(I->getOperand(1))) {
+        Op2VK = TargetTransformInfo::OK_AnyValue;
+        break;
+      }
+      if (i == 0) {
+        CInt = cast<ConstantInt>(I->getOperand(1));
+        continue;
+      }
+      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+          CInt != cast<ConstantInt>(I->getOperand(1)))
+        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+    }
+    // FIXME: Currently cost of model modification for division by power of
+    // 2 is handled for X86 and AArch64. Add support for other targets.
+    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+        CInt->getValue().isPowerOf2())
+      Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+    VecCost =
+        TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, Op1VP, Op2VP);
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP);
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+
+    VecCost =
+        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Instruction::Add, ScalarTy, Op1VK, Op2VK);
+    break;
+  }
+  case Instruction::Load: {
+    // Cost of wide load - cost of scalar loads.
+    unsigned Alignment = cast<LoadInst>(VL0)->getAlignment();
+    VecCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment,
+                                   /*AddressSpace=*/0);
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
+                                         /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
+    auto *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+    VecCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment,
+                                   /*AddressSpace=*/0);
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
+                                         Alignment, /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(VL0);
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+    FastMathFlags FMF;
+    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+      FMF = FPMO->getFastMathFlags();
+
+    // Calculate the cost of the scalar and vector calls.
+    SmallVector<Type *, 4> VecTys;
+    for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
+      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
+                                       VecTy->getVectorNumElements()));
+    }
+
+    VecCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+    ScalarCost =
+        VL.size() * TTI->getIntrinsicInstrCost(
+                        ID, ScalarTy, CI->getFunctionType()->params(), FMF);
+    DEBUG(dbgs() << "SLP: Call cost " << VecCost - ScalarCost << " (" << VecCost
+                 << "-" << ScalarCost << ")"
+                 << " for " << *CI << "\n");
+
+    break;
+  }
+  default:
+    return None;
+  }
+  return VecCost - ScalarCost;
+}
+
 int BoUpSLP::getEntryCost(TreeEntry *E) {
-  ArrayRef<Value*> VL = E->Scalars;
+  ArrayRef<Value *> VL = E->Scalars;
 
   Type *ScalarTy = VL[0]->getType();
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -1651,218 +1823,67 @@
   }
   unsigned Opcode = getSameOpcode(VL);
   assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
-  Instruction *VL0 = cast<Instruction>(VL[0]);
   switch (Opcode) {
-  case Instruction::PHI: {
-    return 0;
-  }
-  case Instruction::ExtractValue:
-  case Instruction::ExtractElement: {
-    if (canReuseExtract(VL, Opcode)) {
-      int DeadCost = 0;
-      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-        Instruction *E = cast<Instruction>(VL[i]);
-        // If all users are going to be vectorized, instruction can be
-        // considered as dead.
-        // The same, if have only one user, it will be vectorized for sure.
-        if (E->hasOneUse() ||
-            std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
-              return ScalarToTreeEntry.count(U) > 0;
-            }))
-          // Take credit for instruction that will become dead.
-          DeadCost +=
-              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
-      }
-      return -DeadCost;
-    }
-    return getGatherCost(VecTy);
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    Type *SrcTy = VL0->getOperand(0)->getType();
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                       VL0->getType(), SrcTy);
-
-    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::FCmp:
-  case Instruction::ICmp:
-  case Instruction::Select: {
-    // Calculate the cost of this instruction.
-    VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
-    int ScalarCost = VecTy->getNumElements() *
-        TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-    int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    // Certain instructions can be cheaper to vectorize if they have a
-    // constant second vector operand.
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-    TargetTransformInfo::OperandValueProperties Op1VP =
-        TargetTransformInfo::OP_None;
-    TargetTransformInfo::OperandValueProperties Op2VP =
-        TargetTransformInfo::OP_None;
-
-    // If all operands are exactly the same ConstantInt then set the
-    // operand kind to OK_UniformConstantValue.
-    // If instead not all operands are constants, then set the operand kind
-    // to OK_AnyValue. If all operands are constants but not the same,
-    // then set the operand kind to OK_NonUniformConstantValue.
-    ConstantInt *CInt = nullptr;
-    for (unsigned i = 0; i < VL.size(); ++i) {
-      const Instruction *I = cast<Instruction>(VL[i]);
-      if (!isa<ConstantInt>(I->getOperand(1))) {
-        Op2VK = TargetTransformInfo::OK_AnyValue;
-        break;
-      }
-      if (i == 0) {
-        CInt = cast<ConstantInt>(I->getOperand(1));
-        continue;
-      }
-      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-          CInt != cast<ConstantInt>(I->getOperand(1)))
-        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-    }
-    // FIXME: Currently cost of model modification for division by power of
-    // 2 is handled for X86 and AArch64. Add support for other targets.
-    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
-        CInt->getValue().isPowerOf2())
-      Op2VP = TargetTransformInfo::OP_PowerOf2;
-
-    int ScalarCost = VecTy->getNumElements() *
-                     TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
-                                                 Op2VK, Op1VP, Op2VP);
-    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
-                                              Op1VP, Op2VP);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::GetElementPtr: {
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-
-    int ScalarCost =
-        VecTy->getNumElements() *
-        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
-    int VecCost =
-        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
-
-    return VecCost - ScalarCost;
-  }
-  case Instruction::Load: {
-    // Cost of wide load - cost of scalar loads.
-    unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
-    int ScalarLdCost = VecTy->getNumElements() *
-        TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                         VecTy, alignment, 0);
-    if (E->NeedToShuffle) {
-      VecLdCost += TTI->getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0);
+  case Instruction::PHI:
+    return 0;
+  case Instruction::ExtractValue:
+  case Instruction::ExtractElement:
+    if (canReuseExtract(VL, Opcode)) {
+      int DeadCost = 0;
+      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+        Instruction *E = cast<Instruction>(VL[i]);
+        // If all users are going to be vectorized, instruction can be
+        // considered as dead.
+        // The same, if have only one user, it will be vectorized for sure.
+        if (E->hasOneUse() ||
+            std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
+              return ScalarToTreeEntry.count(U) > 0;
+            }))
+          // Take credit for instruction that will become dead.
+          DeadCost +=
+              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
       }
-    return VecLdCost - ScalarLdCost;
-  }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
-    int ScalarStCost = VecTy->getNumElements() *
-        TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
-                                         VecTy, alignment, 0);
-    return VecStCost - ScalarStCost;
+      return -DeadCost;
     }
-  case Instruction::Call: {
-    CallInst *CI = cast<CallInst>(VL0);
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
-    // Calculate the cost of the scalar and vector calls.
-    SmallVector<Type*, 4> ScalarTys, VecTys;
-    for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
-      ScalarTys.push_back(CI->getArgOperand(op)->getType());
-      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
-                                       VecTy->getNumElements()));
-    }
-
-    FastMathFlags FMF;
-    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
-      FMF = FPMO->getFastMathFlags();
-
-    int ScalarCallCost = VecTy->getNumElements() *
-        TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
-
-    int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
-
-    DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
-          << " (" << VecCallCost << "-" << ScalarCallCost << ")"
-          << " for " << *CI << "\n");
-
-    return VecCallCost - ScalarCallCost;
+    return getGatherCost(VecTy);
+  case Instruction::ShuffleVector: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_AnyValue;
+    int ScalarCost = 0;
+    int VecCost = 0;
+    for (Value *i : VL) {
+      Instruction *I = cast<Instruction>(i);
+      if (!I)
+        break;
+      ScalarCost +=
+          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
     }
-  case Instruction::ShuffleVector: {
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_AnyValue;
-    int ScalarCost = 0;
-    int VecCost = 0;
-    for (Value *i : VL) {
-      Instruction *I = cast<Instruction>(i);
-      if (!I)
-        break;
-      ScalarCost +=
-          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
-    }
-    // VecCost is equal to sum of the cost of creating 2 vectors
-    // and the cost of creating shuffle.
-    Instruction *I0 = cast<Instruction>(VL[0]);
-    VecCost =
-        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
-    Instruction *I1 = cast<Instruction>(VL[1]);
-    VecCost +=
-        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
-    VecCost +=
-        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
-    return VecCost - ScalarCost;
+    // VecCost is equal to sum of the cost of creating 2 vectors
+    // and the cost of creating shuffle.
+    Instruction *I0 = cast<Instruction>(VL[0]);
+    VecCost = TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+    Instruction *I1 = cast<Instruction>(VL[1]);
+    VecCost +=
+        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+    VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+    return VecCost - ScalarCost;
+  }
+  case Instruction::Load: {
+    int Cost = getCost(Opcode, VL, ScalarTy, VecTy).getValue();
+    if (E->NeedToShuffle) {
+      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  VecTy, 0);
     }
-  default:
-    llvm_unreachable("Unknown instruction");
+    return Cost;
+  }
+  default:
+    if (Optional<int> Cost = getCost(Opcode, VL, ScalarTy, VecTy))
+      return Cost.getValue();
+    break;
   }
+  llvm_unreachable("Unknown instruction");
 }
 
 bool BoUpSLP::isFullyVectorizableTinyTree() {