Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1908,6 +1908,15 @@
     return MinBWs;
   }
 
+  /// \returns True if it is more profitable to scalarize instruction \p I for
+  /// vectorization factor \p VF.
+  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+    auto Scalars = InstsToScalarize.find(VF);
+    if (Scalars == InstsToScalarize.end())
+      return false;
+    return Scalars->second.count(I);
+  }
+
 private:
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
@@ -1950,6 +1959,23 @@
   /// to this type.
   MapVector<Instruction *, uint64_t> MinBWs;
 
+  /// A map holding instruction-cost pairs for each choice of vectorization
+  /// factor. The presence of an instruction in the mapping indicates that it
+  /// will be scalarized when vectorizing with the associated vectorization
+  /// factor.
+  DenseMap<unsigned, DenseMap<Instruction *, unsigned>> InstsToScalarize;
+
+  /// Returns the expected difference in cost from scalarizing the expression
+  /// feeding a predicated instruction \p PredInst. The instructions to
+  /// scalarize and their scalar costs are collected in \p ScalarCosts.
+  int computePredInstDiscount(Instruction *PredInst,
+                              DenseMap<Instruction *, unsigned> &ScalarCosts,
+                              unsigned VF);
+
+  /// Collects the instructions to scalarize for each predicated instruction in
+  /// the loop.
+  void collectInstsToScalarize(unsigned VF);
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -4649,8 +4675,9 @@
     // Scalarize instructions that should remain scalar after vectorization.
     if (!(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
           isa<DbgInfoIntrinsic>(&I)) &&
-        Legal->isScalarAfterVectorization(&I)) {
-      scalarizeInstruction(&I);
+        (Legal->isScalarAfterVectorization(&I) ||
+         Cost->isProfitableToScalarize(&I, VF))) {
+      scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
       continue;
     }
 
@@ -6474,10 +6501,101 @@
   return RUs;
 }
 
+void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
+
+  // If we aren't vectorizing the loop, there's nothing to scalarize.
+  if (VF < 2)
+    return;
+
+  // Find all the instructions that are scalar with predication in the loop and
+  // determine if it would be better not to if-convert the blocks they are in.
+  // If so, we also record the instructions to scalarize.
+  for (auto *BB : TheLoop->blocks()) {
+    if (!Legal->blockNeedsPredication(BB))
+      continue;
+    for (auto &I : *BB)
+      if (Legal->isScalarWithPredication(&I)) {
+        DenseMap<Instruction *, unsigned> ScalarCosts;
+        if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+          InstsToScalarize[VF].insert(ScalarCosts.begin(), ScalarCosts.end());
+      }
+  }
+}
+
+int LoopVectorizationCostModel::computePredInstDiscount(
+    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
+    unsigned VF) {
+
+  // Initialize the discount to zero, meaning that the scalar version and the
+  // vector version cost the same.
+  int Discount = 0;
+
+  // Holds instructions to analyze. The instructions we visit are mapped in
+  // ScalarCosts. Those instructions are the ones that would be scalarized if
+  // we find that the scalar version costs less.
+  SmallVector<Instruction *, 8> Worklist;
+
+  // Compute the expected cost discount from scalarizing the entire expression
+  // feeding the predicated instruction.
+  Worklist.push_back(PredInst);
+  while (!Worklist.empty()) {
+    auto *I = Worklist.pop_back_val();
+
+    // If the instruction is loop invariant or we've already analyzed it,
+    // there's nothing to do.
+    if (ScalarCosts.count(I) || !TheLoop->contains(I))
+      continue;
+
+    // Compute the cost of the vector instruction. Note that this cost already
+    // includes the scalarization overhead of the predicated instruction.
+    unsigned VectorCost = getInstructionCost(I, VF).first;
+
+    // Compute the cost of the scalarized instruction. This cost is the cost of
+    // the instruction as if it wasn't if-converted and instead remained in the
+    // predicated block. We will scale this cost by block probability after
+    // computing the scalarization overhead.
+    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
+
+    // Compute the scalarization overhead of needed insertelement instructions.
+    if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy())
+      ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true,
+                                             false, TTI);
+
+    // Compute the scalarization overhead of needed extractelement
+    // instructions. We will attempt to scalarize all instructions in the
+    // predicated block. Thus, if the instruction uses an instruction defined
+    // outside the predicated block, we will have to include the cost of
+    // extracting it. If the instruction uses an instruction defined inside the
+    // predicated block, we add it to the worklist instead.
+    for (Use &U : I->operands())
+      if (auto *J = dyn_cast<Instruction>(U.get())) {
+        if (PredInst->getParent() != J->getParent())
+          ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
+                                                 false, true, TTI);
+        else
+          Worklist.push_back(J);
+      }
+
+    // Scale the total scalar cost by block probability.
+    ScalarCost /= getReciprocalPredBlockProb();
+
+    // Compute the discount. A non-negative discount means the vector version
+    // of the instruction costs more, and scalarizing would be beneficial.
+    Discount += VectorCost - ScalarCost;
+    ScalarCosts[I] = ScalarCost;
+  }
+
+  return Discount;
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(unsigned VF) {
   VectorizationCostTy Cost;
 
+  // Collect the instructions (and their associated costs) that will be more
+  // profitable to scalarize.
+  collectInstsToScalarize(VF);
+
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     VectorizationCostTy BlockCost;
@@ -6585,6 +6703,9 @@
   if (Legal->isUniformAfterVectorization(I))
     VF = 1;
 
+  if (isProfitableToScalarize(I, VF))
+    return VectorizationCostTy(InstsToScalarize[VF][I], false);
+
   Type *VectorTy;
   unsigned C = getInstructionCost(I, VF, VectorTy);
 
Index: test/Transforms/LoopVectorize/AArch64/predication_costs.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -85,3 +85,89 @@
 for.end:
   ret void
 }
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated udiv
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+;   (add(2) + extractelement(3)) / 2 = 2
+; Cost of udiv:
+;   (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4
+;
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
+;
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp2 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp3 = add nsw i32 %tmp2, %x
+  %tmp4 = udiv i32 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}
+
+; CHECK-LABEL: predicated_store_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated store
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+;   (add(2) + extractelement(3)) / 2 = 2
+; Cost of store:
+;   (store(4) + extractelement(3)) / 2 = 3
+;
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
+;
+define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp2 = add nsw i32 %tmp1, %x
+  store i32 %tmp2, i32* %tmp0, align 4
+  br label %for.inc
+
+for.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
Index: test/Transforms/LoopVectorize/if-pred-non-void.ll
===================================================================
--- test/Transforms/LoopVectorize/if-pred-non-void.ll
+++ test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -207,3 +207,57 @@
   %exitcond = icmp eq i64 %indvars.iv.next, 128
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
+
+
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+; CHECK: vector.body:
+; CHECK: %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4
+; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK: %[[T00:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK: %[[T01:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK: %[[T02:.+]] = add nsw i32 %[[T01]], %x
+; CHECK: %[[T03:.+]] = udiv i32 %[[T00]], %[[T02]]
+; CHECK: %[[T04:.+]] = insertelement <2 x i32> undef, i32 %[[T03]], i32 0
+; CHECK: br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK: %[[T05:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
+; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK: %[[T06:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK: %[[T07:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK: %[[T08:.+]] = add nsw i32 %[[T07]], %x
+; CHECK: %[[T09:.+]] = udiv i32 %[[T06]], %[[T08]]
+; CHECK: %[[T10:.+]] = insertelement <2 x i32> %[[T05]], i32 %[[T09]], i32 1
+; CHECK: br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK: phi <2 x i32> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp2 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp3 = add nsw i32 %tmp2, %x
+  %tmp4 = udiv i32 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}
Index: test/Transforms/LoopVectorize/if-pred-stores.ll
===================================================================
--- test/Transforms/LoopVectorize/if-pred-stores.ll
+++ test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -12,7 +12,6 @@
 ; VEC-LABEL: test
 ; VEC: %[[v0:.+]] = add i64 %index, 0
 ; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
-; VEC: %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20>
 ; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
 ; VEC: %[[o1:.+]] = or <2 x i1> zeroinitializer, %[[v10]]
 ; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[o1]], i32 0
@@ -20,9 +19,10 @@
 ; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]]
 ;
 ; VEC: [[cond]]:
-; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
+; VEC: %[[v13:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; VEC: %[[v9a:.+]] = add nsw i32 %[[v13]], 20
 ; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
-; VEC: store i32 %[[v13]], i32* %[[v2]], align 4
+; VEC: store i32 %[[v9a]], i32* %[[v2]], align 4
 ; VEC: br label %[[else:.+]]
 ;
 ; VEC: [[else]]:
@@ -31,10 +31,11 @@
 ; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]]
 ;
 ; VEC: [[cond2]]:
-; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
+; VEC: %[[v17:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; VEC: %[[v9b:.+]] = add nsw i32 %[[v17]], 20
 ; VEC: %[[v1:.+]] = add i64 %index, 1
 ; VEC: %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]]
-; VEC: store i32 %[[v17]], i32* %[[v4]], align 4
+; VEC: store i32 %[[v9b]], i32* %[[v4]], align 4
 ; VEC: br label %[[else2:.+]]
 ;
 ; VEC: [[else2]]: