Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -413,6 +413,13 @@
 private:
   struct TreeEntry;
 
+  /// Calculates the cost of the scalar sequence of instructions (if only \p
+  /// ScalarTy is specified), the vector sequence (if only \p VecTy is
+  /// specified) or the vector - scalar cost (if both \p ScalarTy and \p
+  /// VecTy are specified).
+  Optional<int> getCostOrDiff(unsigned Opcode, ArrayRef<Value *> VL,
+                              Type *ScalarTy, Type *VecTy = nullptr) const;
+
   /// \returns the cost of the vectorizable entry.
   int getEntryCost(TreeEntry *E);
 
@@ -1630,6 +1637,199 @@
   return true;
 }
 
+Optional<int> BoUpSLP::getCostOrDiff(unsigned Opcode, ArrayRef<Value *> VL,
+                                     Type *ScalarTy, Type *VecTy) const {
+  assert((ScalarTy || VecTy) &&
+         "at least one of ScalarTy/VectorTy parameters must be specified.");
+  assert(Opcode && "Expected non-null opcode.");
+  auto *VL0 = cast<Instruction>(VL[0]);
+  int VecCost;
+  int ScalarCost;
+  // Do we need the difference of the scalar and vector costs.
+  bool Diff = ScalarTy && VecTy;
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    if (VecTy) {
+      VecCost = TTI->getCastInstrCost(
+          Opcode, VecTy, VectorType::get(SrcTy, VecTy->getVectorNumElements()));
+      if (!Diff)
+        return VecCost;
+    }
+
+    // Calculate the cost of this instruction.
+    ScalarCost = VL.size() * TTI->getCastInstrCost(Opcode, ScalarTy, SrcTy);
+    break;
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+  case Instruction::Select: {
+    // Calculate the cost of this instruction.
+    if (VecTy) {
+      VecCost = TTI->getCmpSelInstrCost(
+          Opcode, VecTy,
+          VectorType::get(Type::getInt1Ty(VL0->getContext()),
+                          VecTy->getVectorNumElements()));
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost =
+        VL.size() * TTI->getCmpSelInstrCost(Opcode, ScalarTy,
+                                            Type::getInt1Ty(VL0->getContext()));
+    break;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    // Certain instructions can be cheaper to vectorize if they have a
+    // constant second vector operand.
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+    TargetTransformInfo::OperandValueProperties Op1VP =
+        TargetTransformInfo::OP_None;
+    TargetTransformInfo::OperandValueProperties Op2VP =
+        TargetTransformInfo::OP_None;
+
+    // If all operands are exactly the same ConstantInt then set the
+    // operand kind to OK_UniformConstantValue.
+    // If instead not all operands are constants, then set the operand kind
+    // to OK_AnyValue. If all operands are constants but not the same,
+    // then set the operand kind to OK_NonUniformConstantValue.
+    ConstantInt *CInt = nullptr;
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      const Instruction *I = cast<Instruction>(VL[i]);
+      if (!isa<ConstantInt>(I->getOperand(1))) {
+        Op2VK = TargetTransformInfo::OK_AnyValue;
+        break;
+      }
+      if (i == 0) {
+        CInt = cast<ConstantInt>(I->getOperand(1));
+        continue;
+      }
+      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+          CInt != cast<ConstantInt>(I->getOperand(1)))
+        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+    }
+    // FIXME: Currently cost of model modification for division by power of
+    // 2 is handled for X86 and AArch64. Add support for other targets.
+    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+        CInt->getValue().isPowerOf2())
+      Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+    if (VecTy) {
+      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, Op1VP,
+                                            Op2VP);
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP);
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+
+    if (VecTy) {
+      VecCost =
+          TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Instruction::Add, ScalarTy, Op1VK, Op2VK);
+    break;
+  }
+  case Instruction::Load: {
+    // Cost of wide load - cost of scalar loads.
+    unsigned Alignment = cast<LoadInst>(VL0)->getAlignment();
+    if (VecTy) {
+      VecCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment,
+                                     /*AddressSpace=*/0);
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
+                                         /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
+    auto *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+    if (VecTy) {
+      VecCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment,
+                                     /*AddressSpace=*/0);
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
+                                         Alignment, /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(VL0);
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+    FastMathFlags FMF;
+    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+      FMF = FPMO->getFastMathFlags();
+
+    if (VecTy) {
+      // Calculate the cost of the scalar and vector calls.
+      SmallVector<Type *, 4> VecTys;
+      for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
+        VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
+                                         VecTy->getVectorNumElements()));
+      }
+
+      VecCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost =
+        VL.size() * TTI->getIntrinsicInstrCost(
+                        ID, ScalarTy, CI->getFunctionType()->params(), FMF);
+    break;
+  }
+  default:
+    return None;
+  }
+  return Diff ? VecCost - ScalarCost : ScalarCost;
+}
+
 int BoUpSLP::getEntryCost(TreeEntry *E) {
   ArrayRef<Value *> VL = E->Scalars;
 
@@ -1678,7 +1878,7 @@
       }
       return -DeadCost;
     }
-    return getGatherCost(VecTy);
+    return getGatherCost(VecTy);
   }
   case Instruction::ZExt:
   case Instruction::SExt:
@@ -1691,27 +1891,10 @@
   case Instruction::UIToFP:
   case Instruction::Trunc:
   case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    Type *SrcTy = VL0->getOperand(0)->getType();
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                       VL0->getType(), SrcTy);
-
-    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
-    return VecCost - ScalarCost;
-  }
+  case Instruction::BitCast:
   case Instruction::FCmp:
   case Instruction::ICmp:
-  case Instruction::Select: {
-    // Calculate the cost of this instruction.
-    VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
-    int ScalarCost = VecTy->getNumElements() *
-        TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-    int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
-    return VecCost - ScalarCost;
-  }
+  case Instruction::Select:
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub:
@@ -1729,107 +1912,24 @@
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor: {
-    // Certain instructions can be cheaper to vectorize if they have a
-    // constant second vector operand.
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-    TargetTransformInfo::OperandValueProperties Op1VP =
-        TargetTransformInfo::OP_None;
-    TargetTransformInfo::OperandValueProperties Op2VP =
-        TargetTransformInfo::OP_None;
-
-    // If all operands are exactly the same ConstantInt then set the
-    // operand kind to OK_UniformConstantValue.
-    // If instead not all operands are constants, then set the operand kind
-    // to OK_AnyValue. If all operands are constants but not the same,
-    // then set the operand kind to OK_NonUniformConstantValue.
-    ConstantInt *CInt = nullptr;
-    for (unsigned i = 0; i < VL.size(); ++i) {
-      const Instruction *I = cast<Instruction>(VL[i]);
-      if (!isa<ConstantInt>(I->getOperand(1))) {
-        Op2VK = TargetTransformInfo::OK_AnyValue;
-        break;
-      }
-      if (i == 0) {
-        CInt = cast<ConstantInt>(I->getOperand(1));
-        continue;
-      }
-      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-          CInt != cast<ConstantInt>(I->getOperand(1)))
-        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-    }
-    // FIXME: Currently cost of model modification for division by power of
-    // 2 is handled for X86 and AArch64. Add support for other targets.
-    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
-        CInt->getValue().isPowerOf2())
-      Op2VP = TargetTransformInfo::OP_PowerOf2;
-
-    int ScalarCost = VecTy->getNumElements() *
-                     TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
-                                                 Op2VK, Op1VP, Op2VP);
-    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
-                                              Op1VP, Op2VP);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::GetElementPtr: {
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-
-    int ScalarCost =
-        VecTy->getNumElements() *
-        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
-    int VecCost =
-        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
-
-    return VecCost - ScalarCost;
-  }
+  case Instruction::Xor:
+  case Instruction::GetElementPtr:
+  case Instruction::Store:
+    return getCostOrDiff(Opcode, VL, ScalarTy, VecTy).getValue();
   case Instruction::Load: {
-    // Cost of wide load - cost of scalar loads.
-    unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
-    int ScalarLdCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                         VecTy, alignment, 0);
+    int Cost = getCostOrDiff(Opcode, VL, ScalarTy, VecTy).getValue();
     if (!E->ShuffleMask.empty()) {
-      VecLdCost += TTI->getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0);
+      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  VecTy, 0);
     }
-    return VecLdCost - ScalarLdCost;
-  }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
-    int ScalarStCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
-                                         VecTy, alignment, 0);
-    return VecStCost - ScalarStCost;
+    return Cost;
   }
   case Instruction::Call: {
     CallInst *CI = cast<CallInst>(VL0);
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
-    // Calculate the cost of the scalar and vector calls.
-    SmallVector<Type*, 4> ScalarTys, VecTys;
-    for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
-      ScalarTys.push_back(CI->getArgOperand(op)->getType());
-      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
-                                       VecTy->getNumElements()));
-    }
-
-    FastMathFlags FMF;
-    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
-      FMF = FPMO->getFastMathFlags();
-
-    int ScalarCallCost = VecTy->getNumElements() *
-        TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
+    int ScalarCallCost = getCostOrDiff(Opcode, VL, ScalarTy).getValue();
 
-    int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+    int VecCallCost =
+        getCostOrDiff(Opcode, VL, /*ScalarTy=*/nullptr, VecTy).getValue();
 
     DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
           << " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -4878,6 +4978,7 @@
 
   VisitedInstrs.clear();
 
+  SmallVector<Instruction *, 4> ExtractInsts;
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
     // We may go through BB multiple times so skip the one we have checked.
     if (!VisitedInstrs.insert(&*it).second)