Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -760,6 +760,9 @@ Val->getScalarType()->isIntegerTy(64)) return ((Index % 2 == 0) ? 1 : 0); + if (Opcode == Instruction::ExtractElement) + return ((Val->getScalarSizeInBits() == 1) ? 2 /*+test-under-mask*/ : 1); + return BaseT::getVectorInstrCost(Opcode, Val, Index); } Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2105,6 +2105,10 @@ /// pairs. typedef DenseMap ScalarCostsTy; + /// A set containing all BasicBlocks that are known to be present after + /// vectorization as a predicated block. + SmallPtrSet PredicatedBBsAfterVectorization; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated @@ -6757,6 +6761,9 @@ ScalarCostsTy ScalarCosts; if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); + + // Remember that BB will remain after vectorization. + PredicatedBBsAfterVectorization.insert(BB); } } } @@ -7203,7 +7210,28 @@ // instruction cost. return 0; case Instruction::Br: { - return TTI.getCFInstrCost(I->getOpcode()); + // In cases of scalarized and predicated instructions, there will be VF + // predicated blocks in the vectorized loop. Each branch around these + // blocks also requires an extract of its vector compare i1 element. 
+ bool ScalarPredicatedBB = false; + BranchInst *BI = cast(I); + if (VF > 1 && BI->isConditional() && + (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || + PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) + ScalarPredicatedBB = true; + + if (ScalarPredicatedBB) { + // Return cost for branches around scalarized and predicated blocks. + Type *Vec_i1Ty = + VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + + (TTI.getCFInstrCost(Instruction::Br) * VF)); + } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) + // The back-edge branch will remain, as will all scalar branches. + return TTI.getCFInstrCost(Instruction::Br); + else + // This branch will be eliminated by if-conversion. + return 0; } case Instruction::PHI: { auto *Phi = cast(I); Index: test/Analysis/CostModel/SystemZ/branch-predicated-vectorized-block.ll =================================================================== --- /dev/null +++ test/Analysis/CostModel/SystemZ/branch-predicated-vectorized-block.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s +; +; Check costs for branches inside a vectorized loop around predicated +; blocks. Each such branch will be guarded with an extractelement from the +; vector compare plus a test under mask instruction. This cost is modelled on +; the extractelement of i1. 
+ +define void @fun(i64 *%ptr) { +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: br label %for.body +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %0 = insertelement <2 x i64> undef, i64 %indvars.iv, i32 0 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %1 = insertelement <2 x i64> %0, i64 %indvars.iv, i32 1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp eq <2 x i64> %1, +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %E = extractelement <2 x i1> %cmp, i32 0 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: br i1 %E, label %pred.store.0, label %loop1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %arrayidx = getelementptr inbounds i64, i64* %ptr, i64 %indvars.iv +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store i64 0, i64* %arrayidx +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: br label %loop1 +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %E1 = extractelement <2 x i1> %cmp, i32 1 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: br i1 %E1, label %pred.store.1, label %for.inc +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %arrayidx1 = getelementptr inbounds i64, i64* %ptr, i64 %indvars.iv +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store i64 1, i64* %arrayidx1 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: br label %for.inc +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %exitcond = icmp eq i64 %indvars.iv.next, 256 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: br i1 
%exitcond, label %for.end.loopexit, label %for.body +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: ret void + +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] + %0 = insertelement <2 x i64> undef, i64 %indvars.iv, i32 0 + %1 = insertelement <2 x i64> %0, i64 %indvars.iv, i32 1 + %cmp = icmp eq <2 x i64> %1, + %E = extractelement <2 x i1> %cmp, i32 0 + br i1 %E, label %pred.store.0, label %loop1 + +pred.store.0: + %arrayidx = getelementptr inbounds i64, i64* %ptr, i64 %indvars.iv + store i64 0, i64* %arrayidx + br label %loop1 + +loop1: + %E1 = extractelement <2 x i1> %cmp, i32 1 + br i1 %E1, label %pred.store.1, label %for.inc + +pred.store.1: + %arrayidx1 = getelementptr inbounds i64, i64* %ptr, i64 %indvars.iv + store i64 1, i64* %arrayidx1 + br label %for.inc + +for.inc: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 256 + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + ret void +}