Add an optional PredicatedBBVF argument to TTI::getCFInstrCost().

A non-zero PredicatedBBVF tells the target that the branch being costed
precedes a predicated block produced by vectorization. The LoopVectorizer
passes the VF for conditional branches that have a successor recorded in
PredicatedBBsAfterVectorization; the CostModel analysis passes 1. SystemZ
returns PredicatedBBVF * 2 for Br; the other implementations touched here
accept the argument without using it for Br.

NOTE(review): template arguments (dyn_cast<>, isa<>, cast<>, ArrayRef<>,
DenseMap<>, SmallPtrSet<>) and the vector constant in the test were lost in
an earlier copy of this patch and have been restored by hand; confirm them
against the upstream review before applying.

Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -607,8 +607,10 @@
                                unsigned Index = -1) const;
 
   /// \return The expected cost of control-flow related instructions such as
-  /// Phi, Ret, Br.
-  int getCFInstrCost(unsigned Opcode) const;
+  /// Phi, Ret, Br. If PredicatedBBVF is non-zero, this is a branch before a
+  /// predicated block after vectorization with a VF of the passed
+  /// value. CostModel will pass '1' for each such branch.
+  int getCFInstrCost(unsigned Opcode, unsigned PredicatedBBVF = 0) const;
 
   /// \returns The expected cost of compare and select instructions. If there
   /// is an existing instruction that holds Opcode, it may be passed in the
@@ -856,7 +858,7 @@
                                const Instruction *I) = 0;
   virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                        VectorType *VecTy, unsigned Index) = 0;
-  virtual int getCFInstrCost(unsigned Opcode) = 0;
+  virtual int getCFInstrCost(unsigned Opcode, unsigned PredicatedBBVF) = 0;
   virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                  Type *CondTy, const Instruction *I) = 0;
   virtual int getVectorInstrCost(unsigned Opcode, Type *Val,
@@ -1110,8 +1112,8 @@
                                unsigned Index) override {
     return Impl.getExtractWithExtendCost(Opcode, Dst, VecTy, Index);
   }
-  int getCFInstrCost(unsigned Opcode) override {
-    return Impl.getCFInstrCost(Opcode);
+  int getCFInstrCost(unsigned Opcode, unsigned PredicatedBBVF) override {
+    return Impl.getCFInstrCost(Opcode, PredicatedBBVF);
   }
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                          const Instruction *I) override {
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -346,7 +346,9 @@
     return 1;
   }
 
-  unsigned getCFInstrCost(unsigned Opcode) { return 1; }
+  unsigned getCFInstrCost(unsigned Opcode, unsigned PredicatedBBVF) {
+    return 1;
+  }
 
   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                               const Instruction *I) {
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -533,7 +533,7 @@
                                      VecTy->getElementType());
   }
 
-  unsigned getCFInstrCost(unsigned Opcode) {
+  unsigned getCFInstrCost(unsigned Opcode, unsigned PredicatedBBVF = 0) {
     // Branches are assumed to be predicted.
     return 0;
   }
Index: lib/Analysis/CostModel.cpp
===================================================================
--- lib/Analysis/CostModel.cpp
+++ lib/Analysis/CostModel.cpp
@@ -414,6 +414,13 @@
   case Instruction::Ret:
   case Instruction::PHI:
   case Instruction::Br: {
+    // If LoopVectorizer produced a loop with predicated blocks, pass '1' to
+    // TTI before each branch.
+    if (const BranchInst *BI = dyn_cast<BranchInst>(I))
+      if (BI->isConditional())
+        if (isa<ExtractElementInst>(BI->getCondition()))
+          return TTI->getCFInstrCost(Instruction::Br, 1);
+
     return TTI->getCFInstrCost(I->getOpcode());
   }
   case Instruction::Add:
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -346,8 +346,11 @@
   return Cost;
 }
 
-int TargetTransformInfo::getCFInstrCost(unsigned Opcode) const {
-  int Cost = TTIImpl->getCFInstrCost(Opcode);
+int TargetTransformInfo::getCFInstrCost(unsigned Opcode,
+                                        unsigned PredicatedBBVF) const {
+  assert((Opcode == Instruction::Br || !PredicatedBBVF) &&
+         "PredicatedBBVF should only be passed for a Br instruction");
+  int Cost = TTIImpl->getCFInstrCost(Opcode, PredicatedBBVF);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -99,7 +99,7 @@
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
       ArrayRef<const Value *> Args = ArrayRef<const Value *>());
 
-  unsigned getCFInstrCost(unsigned Opcode);
+  unsigned getCFInstrCost(unsigned Opcode, unsigned PredicatedBBVF = 0);
 
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
   bool isSourceOfDivergence(const Value *V) const;
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -258,14 +258,15 @@
                                        Opd1PropInfo, Opd2PropInfo);
 }
 
-unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
+unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode,
+                                       unsigned PredicatedBBVF) {
   // XXX - For some reason this isn't called for switch.
   switch (Opcode) {
   case Instruction::Br:
   case Instruction::Ret:
     return 10;
   default:
-    return BaseT::getCFInstrCost(Opcode);
+    return BaseT::getCFInstrCost(Opcode, PredicatedBBVF);
   }
 }
 
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -77,6 +77,7 @@
   unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        const Instruction *I = nullptr);
+  unsigned getCFInstrCost(unsigned Opcode, unsigned PredicatedBBVF = 0);
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                          const Instruction *I = nullptr);
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -681,6 +681,17 @@
   return BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
+unsigned SystemZTTIImpl::getCFInstrCost(unsigned Opcode, unsigned PredicatedBBVF) {
+  if (Opcode == Instruction::Br) {
+    // Branches are generally assumed predicted, and have a cost of 0. After
+    // vectorization, each conditional branch is going to need an extraction
+    // of an element after vector compare, plus a test under mask instruction.
+    return PredicatedBBVF * 2;
+  }
+
+  return BaseT::getCFInstrCost(Opcode, PredicatedBBVF);
+}
+
 int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                        Type *CondTy, const Instruction *I) {
   if (ValTy->isVectorTy()) {
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2105,6 +2105,10 @@
   /// pairs.
   typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
 
+  /// A set containing all BasicBlocks that are known to be present after
+  /// vectorization as a predicated block.
+  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -6759,6 +6763,9 @@
         ScalarCostsTy ScalarCosts;
         if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+
+        // Remember that BB will remain after vectorization.
+        PredicatedBBsAfterVectorization.insert(BB);
       }
   }
 }
@@ -7205,7 +7212,15 @@
     // instruction cost.
     return 0;
   case Instruction::Br: {
-    return TTI.getCFInstrCost(I->getOpcode());
+    // Let TTI model the extra cost in case of a branch before a predicated
+    // block, by passing VF in such cases.
+    unsigned PredicatedBBVF = 0;
+    BranchInst *BI = cast<BranchInst>(I);
+    if (VF > 1 && BI->isConditional() &&
+        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
+         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+      PredicatedBBVF = VF;
+    return TTI.getCFInstrCost(Instruction::Br, PredicatedBBVF);
   }
   case Instruction::PHI: {
     auto *Phi = cast<PHINode>(I);
Index: test/Analysis/CostModel/SystemZ/branch-predicated-vectorized-block.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/SystemZ/branch-predicated-vectorized-block.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Check costs for branches inside a vectorized loop around predicated blocks.
+
+define void @fun(i64 *%ptr) {
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   br label %for.body
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   br i1 %E, label %pred.store.0, label %loop1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   br label %loop1
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   br i1 %E1, label %pred.store.1, label %for.inc
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   br label %for.inc
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %0 = insertelement <2 x i64> undef, i64 %indvars.iv, i32 0
+  %1 = insertelement <2 x i64> %0, i64 %indvars.iv, i32 1
+  %cmp = icmp eq <2 x i64> %1, <i64 0, i64 1>
+  %E = extractelement <2 x i1> %cmp, i32 0
+  br i1 %E, label %pred.store.0, label %loop1
+
+pred.store.0:
+  %arrayidx = getelementptr inbounds i64, i64* %ptr, i64 %indvars.iv
+  store i64 0, i64* %arrayidx
+  br label %loop1
+
+loop1:
+  %E1 = extractelement <2 x i1> %cmp, i32 1
+  br i1 %E1, label %pred.store.1, label %for.inc
+
+pred.store.1:
+  %arrayidx1 = getelementptr inbounds i64, i64* %ptr, i64 %indvars.iv
+  store i64 1, i64* %arrayidx1
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 256
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  ret void
+}