Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -606,6 +606,13 @@ /// \returns the cost of the vectorizable entry. int getEntryCost(TreeEntry *E); + /// \returns the cost of the scalar instruction \p I. + int getScalarCost(Instruction *I, Type *DstSclTy); + + /// \returns the cost of a vectorized version of the scalar instruction \p I. + int getVectorCost(Instruction *I, ArrayRef<Value *> VL, Type *DstSclTy, + VectorType *DstVecTy); + /// This is the recursive part of buildTree. void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int); @@ -2184,40 +2191,32 @@ case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - Type *SrcTy = VL0->getOperand(0)->getType(); + int ScalarEltCost = getScalarCost(VL0, ScalarTy); if (NeedToShuffleReuses) { - ReuseShuffleCost -= - (ReuseShuffleNumbers - VL.size()) * - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. - int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), - VL0->getType(), SrcTy, VL0); + int ScalarCost = VL.size() * ScalarEltCost; + Type *SrcTy = VL0->getOperand(0)->getType(); VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); int VecCost = 0; // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { - VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); + VecCost = ReuseShuffleCost + getVectorCost(VL0, VL, ScalarTy, VecTy); } return VecCost - ScalarCost; } case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: { - // Calculate the cost of this instruction. 
+ int ScalarEltCost = getScalarCost(VL0, ScalarTy); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * - TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, - Builder.getInt1Ty(), VL0); - } - VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); - int ScalarCost = VecTy->getNumElements() * - TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, - Builder.getInt1Ty(), VL0); - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } + int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int VecCost = getVectorCost(VL0, VL, ScalarTy, VecTy); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: @@ -2238,88 +2237,31 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: { - // Certain instructions can be cheaper to vectorize if they have a - // constant second vector operand. - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; - TargetTransformInfo::OperandValueProperties Op1VP = - TargetTransformInfo::OP_None; - TargetTransformInfo::OperandValueProperties Op2VP = - TargetTransformInfo::OP_PowerOf2; - - // If all operands are exactly the same ConstantInt then set the - // operand kind to OK_UniformConstantValue. - // If instead not all operands are constants, then set the operand kind - // to OK_AnyValue. If all operands are constants but not the same, - // then set the operand kind to OK_NonUniformConstantValue. 
- ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast<Instruction>(VL[i]); - ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1)); - if (!CInt) { - Op2VK = TargetTransformInfo::OK_AnyValue; - Op2VP = TargetTransformInfo::OP_None; - break; - } - if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; - } - if (CInt0 != CInt) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - } - - SmallVector<const Value *, 4> Operands(VL0->operand_values()); + int ScalarEltCost = getScalarCost(VL0, ScalarTy); if (NeedToShuffleReuses) { - ReuseShuffleCost -= - (ReuseShuffleNumbers - VL.size()) * - TTI->getArithmeticInstrCost(S.getOpcode(), ScalarTy, Op1VK, Op2VK, - Op1VP, Op2VP, Operands); - } - int ScalarCost = - VecTy->getNumElements() * - TTI->getArithmeticInstrCost(S.getOpcode(), ScalarTy, Op1VK, Op2VK, - Op1VP, Op2VP, Operands); - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } + int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int VecCost = getVectorCost(VL0, VL, ScalarTy, VecTy); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; - + int ScalarEltCost = getScalarCost(VL0, ScalarTy); if (NeedToShuffleReuses) { - ReuseShuffleCost -= 
(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } + int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int VecCost = getVectorCost(VL0, VL, ScalarTy, VecTy); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. - unsigned alignment = cast<LoadInst>(VL0)->getAlignment(); + int ScalarEltCost = getScalarCost(VL0, ScalarTy); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, - alignment, 0, VL0); - } - int ScalarLdCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); - int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, - VecTy, alignment, 0, VL0); + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } + int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; + int VecLdCost = getVectorCost(VL0, VL, ScalarTy, VecTy); if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecLdCost += TTI->getShuffleCost( @@ -2329,46 +2271,25 @@ } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. 
- unsigned alignment = cast<StoreInst>(VL0)->getAlignment(); + int ScalarEltCost = getScalarCost(VL0, ScalarTy); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, - alignment, 0, VL0); - } - int ScalarStCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, - VecTy, alignment, 0, VL0); + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } + int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; + int VecStCost = getVectorCost(VL0, VL, ScalarTy, VecTy); return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { - CallInst *CI = cast<CallInst>(VL0); - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - - // Calculate the cost of the scalar and vector calls. - SmallVector<Type *, 4> ScalarTys; - for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) - ScalarTys.push_back(CI->getArgOperand(op)->getType()); - - FastMathFlags FMF; - if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) - FMF = FPMO->getFastMathFlags(); - + int ScalarEltCost = getScalarCost(VL0, ScalarTy); if (NeedToShuffleReuses) { - ReuseShuffleCost -= - (ReuseShuffleNumbers - VL.size()) * - TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); - } - int ScalarCallCost = VecTy->getNumElements() * - TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); - - SmallVector<Value *, 4> Args(CI->arg_operands()); - int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF, - VecTy->getNumElements()); + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } + int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; + int VecCallCost = getVectorCost(VL0, VL, ScalarTy, VecTy); LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" - << " for " << *CI << "\n"); + << " for " << *cast<CallInst>(VL0) << "\n"); 
return ReuseShuffleCost + VecCallCost - ScalarCallCost; } @@ -2379,20 +2300,17 @@ (Instruction::isCast(S.getOpcode()) && Instruction::isCast(S.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); - int ScalarCost = 0; if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { Instruction *I = cast<Instruction>(VL[Idx]); - ReuseShuffleCost -= TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + ReuseShuffleCost -= getScalarCost(I, I->getType()); } for (Value *V : VL) { Instruction *I = cast<Instruction>(V); - ReuseShuffleCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + ReuseShuffleCost += getScalarCost(I, I->getType()); } } - int VecCost = 0; + int ScalarCost = 0; for (Value *i : VL) { Instruction *I = cast<Instruction>(i); assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); @@ -2401,17 +2319,9 @@ } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); - } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); - VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); - VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); - } + int VecCost = 0; + VecCost = getVectorCost(S.MainOp, VL, S.MainOp->getType(), VecTy); + VecCost += getVectorCost(S.AltOp, VL, S.AltOp->getType(), VecTy); VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; } @@ -2420,6 +2330,201 @@ } } +int BoUpSLP::getScalarCost(Instruction *I, Type *DstTy) { + unsigned Opcode = I->getOpcode(); + switch (Opcode) { + case Instruction::PHI: + return 0; + case 
Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = I->getOperand(0)->getType(); + return TTI->getCastInstrCost(Opcode, DstTy, SrcTy, I); + } + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: + return TTI->getCmpSelInstrCost(Opcode, DstTy, Builder.getInt1Ty(), I); + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return TTI->getInstructionCost(I, TargetTransformInfo::TCK_RecipThroughput); + case Instruction::GetElementPtr: { + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_UniformConstantValue; + return TTI->getArithmeticInstrCost(Instruction::Add, DstTy, Op1VK, Op2VK); + } + case Instruction::Load: { + unsigned alignment = cast<LoadInst>(I)->getAlignment(); + return TTI->getMemoryOpCost(Instruction::Load, DstTy, alignment, 0, I); + } + case Instruction::Store: { + unsigned alignment = cast<StoreInst>(I)->getAlignment(); + return TTI->getMemoryOpCost(Instruction::Store, DstTy, alignment, 0, I); + } + case Instruction::Call: { + CallInst *CI = cast<CallInst>(I); + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + + SmallVector<Type *, 4> ScalarTys; + for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) + 
ScalarTys.push_back(CI->getArgOperand(op)->getType()); + + FastMathFlags FMF; + if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) + FMF = FPMO->getFastMathFlags(); + + return TTI->getIntrinsicInstrCost(ID, DstTy, ScalarTys, FMF); + } + } + + llvm_unreachable("Unknown instruction"); +} + +int BoUpSLP::getVectorCost(Instruction *I, ArrayRef<Value *> VL, Type *DstSclTy, + VectorType *DstVecTy) { + unsigned Opcode = I->getOpcode(); + switch (Opcode) { + case Instruction::PHI: + return 0; + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcSclTy = I->getOperand(0)->getType(); + Type *SrcVecTy = VectorType::get(SrcSclTy, VL.size()); + return TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, I); + } + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: { + Type *MaskSclTy = Builder.getInt1Ty(); + Type *MaskVecTy = VectorType::get(MaskSclTy, VL.size()); + return TTI->getCmpSelInstrCost(Opcode, DstVecTy, MaskVecTy, I); + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Certain instructions can be cheaper to vectorize if they have a + // constant second vector operand. 
+ TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OperandValueProperties Op1VP = + TargetTransformInfo::OP_None; + TargetTransformInfo::OperandValueProperties Op2VP = + TargetTransformInfo::OP_PowerOf2; + + // If all operands are exactly the same ConstantInt then set the + // operand kind to OK_UniformConstantValue. + // If instead not all operands are constants, then set the operand kind + // to OK_AnyValue. If all operands are constants but not the same, + // then set the operand kind to OK_NonUniformConstantValue. + ConstantInt *CInt0 = nullptr; + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + const Instruction *I = cast<Instruction>(VL[i]); + ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1)); + if (!CInt) { + Op2VK = TargetTransformInfo::OK_AnyValue; + Op2VP = TargetTransformInfo::OP_None; + break; + } + if (Op2VP == TargetTransformInfo::OP_PowerOf2 && + !CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_None; + if (i == 0) { + CInt0 = CInt; + continue; + } + if (CInt0 != CInt) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + } + SmallVector<const Value *, 4> Operands(I->operand_values()); + return TTI->getArithmeticInstrCost(Opcode, DstVecTy, Op1VK, Op2VK, Op1VP, + Op2VP, Operands); + } + case Instruction::GetElementPtr: { + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_UniformConstantValue; + return TTI->getArithmeticInstrCost(Instruction::Add, DstVecTy, Op1VK, + Op2VK); + } + case Instruction::Load: { + unsigned alignment = cast<LoadInst>(I)->getAlignment(); + return TTI->getMemoryOpCost(Instruction::Load, DstVecTy, alignment, 0, I); + } + case Instruction::Store: { + unsigned alignment = cast<StoreInst>(I)->getAlignment(); + return TTI->getMemoryOpCost(Instruction::Store, DstVecTy, alignment, 0, I); + } + case 
Instruction::Call: { + CallInst *CI = cast<CallInst>(I); + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + + FastMathFlags FMF; + if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) + FMF = FPMO->getFastMathFlags(); + + SmallVector<Value *, 4> Args(CI->arg_operands()); + return TTI->getIntrinsicInstrCost(ID, DstSclTy, Args, FMF, VL.size()); + } + } + + llvm_unreachable("Unknown instruction"); +} + bool BoUpSLP::isFullyVectorizableTinyTree() { LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " << VectorizableTree.size() << " is fully vectorizable .\n");