Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -384,11 +384,11 @@
   void updateAnalysis();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
-  /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
+  /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
   /// scalarized instruction behind an if block predicated on the control
   /// dependence of the instruction.
   virtual void scalarizeInstruction(Instruction *Instr,
-                                    bool IfPredicateStore = false);
+                                    bool IfPredicateInstr = false);
 
   /// Vectorize Load and Store instructions,
   virtual void vectorizeMemoryInstruction(Instruction *Instr);
@@ -591,7 +591,7 @@
   /// Store instructions that should be predicated, as a pair
   ///   <StoreInst, Predicate>
-  SmallVector<std::pair<StoreInst *, Value *>, 4> PredicatedStores;
+  SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
   EdgeMaskCache MaskCache;
   /// Trip count of the original loop.
   Value *TripCount;
@@ -625,7 +625,7 @@
 
 private:
   void scalarizeInstruction(Instruction *Instr,
-                            bool IfPredicateStore = false) override;
+                            bool IfPredicateInstr = false) override;
   void vectorizeMemoryInstruction(Instruction *Instr) override;
   Value *getBroadcastInstrs(Value *V) override;
   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -2675,8 +2675,11 @@
 }
 
 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
-                                               bool IfPredicateStore) {
+                                               bool IfPredicateInstr) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  DEBUG(dbgs() << "LV: Scalarizing"
+               << (IfPredicateInstr ? " and predicating:" : ":")
+               << *Instr << '\n');
   // Holds vector parameters or scalars, in case of uniform vals.
   SmallVector<VectorParts, 4> Params;
 
@@ -2720,7 +2723,7 @@
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
 
   VectorParts Cond;
-  if (IfPredicateStore) {
+  if (IfPredicateInstr) {
     assert(Instr->getParent()->getSinglePredecessor() &&
            "Only support single predecessor blocks");
     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
@@ -2734,7 +2737,7 @@
 
       // Start if-block.
       Value *Cmp = nullptr;
-      if (IfPredicateStore) {
+      if (IfPredicateInstr) {
         Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
         Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
                                  ConstantInt::get(Cmp->getType(), 1));
@@ -2773,9 +2776,8 @@
       VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
                                                      Builder.getInt32(Width));
       // End if-block.
-      if (IfPredicateStore)
-        PredicatedStores.push_back(
-            std::make_pair(cast<StoreInst>(Cloned), Cmp));
+      if (IfPredicateInstr)
+        PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
     }
   }
 }
@@ -3306,9 +3308,12 @@
   return V;
 }
 
-/// Estimate the overhead of scalarizing a value. Insert and Extract are set if
-/// the result needs to be inserted and/or extracted from vectors.
+/// \brief Estimate the overhead of scalarizing a value. Insert and Extract are
+/// set if the result needs to be inserted and/or extracted from vectors.
+/// If the instruction is also to be predicated, the insertion cost is
+/// increased by the cost of the required PHI node.
 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
+                                         bool Predicate,
                                          const TargetTransformInfo &TTI) {
   if (Ty->isVoidTy())
     return 0;
@@ -3317,15 +3322,56 @@
   unsigned Cost = 0;
 
   for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
-    if (Insert)
+    if (Insert) {
+      if (Predicate)
+        Cost += TTI.getCFInstrCost(Instruction::PHI);
       Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
+    }
     if (Extract)
       Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
   }
 
+  // We assume that if-converted blocks have a 50% chance of being executed.
+  // Predicated scalarized instructions are avoided due to the CF that
+  // bypasses turned-off lanes. The extracts and inserts will be sunk/hoisted
+  // to the predicated basic block and are subject to the same assumption.
+  if (Predicate)
+    Cost /= 2;
+
   return Cost;
 }
 
+/// \brief Estimate the overhead of scalarizing an instruction based on the
+/// types of its operands and return value.
+static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
+                                         Type *RetTy, bool Predicate,
+                                         const TargetTransformInfo &TTI) {
+  unsigned ScalarizationCost =
+      getScalarizationOverhead(RetTy, true, false, Predicate, TTI);
+
+  for (Type *Ty : OpTys)
+    ScalarizationCost +=
+        getScalarizationOverhead(Ty, false, true, Predicate, TTI);
+
+  return ScalarizationCost;
+}
+
+/// \brief Estimate the overhead of scalarizing an instruction based on the
+/// types of its operands and return value.
+static unsigned getScalarizationOverhead(Instruction *I, Type *RetTy,
+                                         unsigned VF, bool Predicate,
+                                         const TargetTransformInfo &TTI) {
+  if (VF == 1)
+    return 0;
+
+  SmallVector<Type *, 4> OpTys;
+  unsigned OperandsNum = I->getNumOperands();
+  for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)
+    OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF));
+
+  return getScalarizationOverhead(OpTys, RetTy, Predicate, TTI);
+}
+
 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
 // Return the cost of the instruction, including scalarization overhead if it's
 // needed. The flag NeedToScalarize shows if the call needs to be scalarized -
@@ -3356,10 +3402,7 @@
 
   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  unsigned ScalarizationCost =
-      getScalarizationOverhead(RetTy, true, false, TTI);
-  for (Type *Ty : Tys)
-    ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
+  unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, false, TTI);
 
   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
 
@@ -3779,15 +3822,61 @@
   // Make sure DomTree is updated.
   updateAnalysis();
 
-  // Predicate any stores.
-  for (auto KV : PredicatedStores) {
+  // Predicate any instructions.
+  for (auto KV : PredicatedInstructions) {
     BasicBlock::iterator I(KV.first);
-    auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
+    BasicBlock *Head = I->getParent();
+    auto *BB = SplitBlock(Head, &*std::next(I), DT, LI);
     auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
                                         /*BranchWeights=*/nullptr, DT, LI);
     I->moveBefore(T);
-    I->getParent()->setName("pred.store.if");
-    BB->setName("pred.store.continue");
+    // Try to move any extract-element we may have created for the predicated
+    // instruction into the Then block.
+    for (unsigned In = 0; In < I->getNumOperands(); ++In) {
+      ExtractElementInst *OpInst =
+          dyn_cast<ExtractElementInst>(I->getOperand(In));
+      if (!OpInst)
+        continue;
+      bool CanSinkToUse = true;
+      for (User *U : OpInst->users()) {
+        if (U != &*I) {
+          CanSinkToUse = false;
+          break;
+        }
+      }
+      if (CanSinkToUse)
+        OpInst->moveBefore(&*I);
+    }
+    I->getParent()->setName(Twine("pred.") + Twine(I->getOpcodeName()) +
+                            Twine(".if"));
+    BB->setName(Twine("pred.") + Twine(I->getOpcodeName()) +
+                Twine(".continue"));
+    // If the instruction is non-void, create a Phi node at the reconvergence point.
+    if (!I->getType()->isVoidTy()) {
+      Value *IncomingTrue = nullptr;
+      Value *IncomingFalse = nullptr;
+      auto Users = I->users();
+      if (std::distance(Users.begin(), Users.end()) == 1 &&
+          isa<InsertElementInst>(*I->users().begin())) {
+        // If the predicated instruction is feeding an insert-element, move it
+        // into the Then block; a Phi node will be created for the vector.
+        InsertElementInst *IEI = cast<InsertElementInst>(*Users.begin());
+        IEI->moveBefore(T);
+        IncomingTrue = IEI; // the new vector with the inserted element.
+        IncomingFalse = IEI->getOperand(0); // the unmodified vector
+      } else {
+        // A Phi node will be created for the scalar predicated instruction.
+        IncomingTrue = &*I;
+        IncomingFalse = UndefValue::get(I->getType());
+      }
+      BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
+      assert(PostDom && "Then block has multiple successors");
+      PHINode *Phi =
+          PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
+      IncomingTrue->replaceAllUsesWith(Phi);
+      Phi->addIncoming(IncomingFalse, Head);
+      Phi->addIncoming(IncomingTrue, I->getParent());
+    }
   }
 
   DEBUG(DT->verifyDomTree());
   // Remove redundant induction instructions.
@@ -4156,18 +4245,23 @@
       continue;
     } // End of PHI.
 
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::SRem:
+    case Instruction::FRem:
+      // Scalarize if block needs predication, otherwise fallthrough.
+      if (Legal->blockNeedsPredication(I.getParent())) {
+        scalarizeInstruction(&I, true);
+        continue;
+      }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
-   case Instruction::UDiv:
-   case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
-   case Instruction::SRem:
-   case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
@@ -4998,17 +5092,6 @@
     }
     if (I.mayThrow())
       return false;
-
-    // The instructions below can trap.
-    switch (I.getOpcode()) {
-    default:
-      continue;
-    case Instruction::UDiv:
-    case Instruction::SDiv:
-    case Instruction::URem:
-    case Instruction::SRem:
-      return false;
-    }
   }
 
   return true;
@@ -5781,7 +5864,6 @@
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(unsigned VF) {
   VectorizationCostTy Cost;
-
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     VectorizationCostTy BlockCost;
@@ -5938,17 +6020,23 @@
       // TODO: IF-converted IFs become selects.
      return 0;
    }
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+    // We assume that if-converted blocks have a 50% chance of being executed.
+    // Predicated scalarized instructions are avoided due to the CF that
+    // bypasses turned-off lanes. If we are not predicating, fallthrough.
+    if (VF > 1 && Legal->blockNeedsPredication(I->getParent()))
+      return VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy) / 2 +
+             getScalarizationOverhead(I, VectorTy, VF, true, TTI);
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub:
   case Instruction::FSub:
   case Instruction::Mul:
   case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
   case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
   case Instruction::FRem:
   case Instruction::Shl:
   case Instruction::LShr:
@@ -6182,28 +6270,11 @@
       return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
     return CallCost;
   }
-  default: {
-    // We are scalarizing the instruction. Return the cost of the scalar
-    // instruction, plus the cost of insert and extract into vector
-    // elements, times the vector width.
-    unsigned Cost = 0;
-
-    if (!RetTy->isVoidTy() && VF != 1) {
-      unsigned InsCost =
-          TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy);
-      unsigned ExtCost =
-          TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy);
-
-      // The cost of inserting the results plus extracting each one of the
-      // operands.
-      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
-    }
-
+  default:
     // The cost of executing VF copies of the scalar instruction. This opcode
     // is unknown. Assume that it is the same as 'mul'.
-    Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
-    return Cost;
-  }
+    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
+           getScalarizationOverhead(I, VectorTy, VF, false, TTI);
   } // end of switch.
 }
 
@@ -6307,7 +6378,7 @@
 }
 
 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
-                                             bool IfPredicateStore) {
+                                             bool IfPredicateInstr) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
   SmallVector<VectorParts, 4> Params;
@@ -6350,7 +6421,7 @@
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
 
   VectorParts Cond;
-  if (IfPredicateStore) {
+  if (IfPredicateInstr) {
     assert(Instr->getParent()->getSinglePredecessor() &&
            "Only support single predecessor blocks");
     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
@@ -6363,7 +6434,7 @@
 
     // Start an "if (pred) a[i] = ..." block.
     Value *Cmp = nullptr;
-    if (IfPredicateStore) {
+    if (IfPredicateInstr) {
       if (Cond[Part]->getType()->isVectorTy())
         Cond[Part] =
             Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
@@ -6394,16 +6465,16 @@
     VecResults[Part] = Cloned;
 
     // End if-block.
-    if (IfPredicateStore)
-      PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), Cmp));
+    if (IfPredicateInstr)
+      PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
   }
 }
 
 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
   auto *SI = dyn_cast<StoreInst>(Instr);
-  bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
+  bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
 
-  return scalarizeInstruction(Instr, IfPredicateStore);
+  return scalarizeInstruction(Instr, IfPredicateInstr);
 }
 
 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
Index: test/Transforms/LoopVectorize/if-pred-non-void.ll
===================================================================
--- test/Transforms/LoopVectorize/if-pred-non-void.ll
+++ test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -0,0 +1,43 @@
+; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test predication of non-void instructions, specifically the creation of an
+; insertelement and a Phi node.
+define void @test(i32* nocapture %a) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %if.end
+  ret void
+
+; CHECK-LABEL: test
+; CHECK: vector.body:
+; CHECK: br i1 %{{.*}}, label %[[cond:[a-zA-Z0-9.]+]], label %[[else:[a-zA-Z0-9.]+]]
+; CHECK: [[cond]]:
+; CHECK: %[[v0:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
+; CHECK: %[[v1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[v0]], i32 0
+; CHECK: br label %[[else]]
+; CHECK: [[else]]:
+; CHECK: %{{.*}} = phi <2 x i32> [ undef, %vector.body ], [ %[[v1]], %[[cond]] ]
+
+for.body:                                         ; preds = %if.end, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 27
+  %cmp1 = icmp slt i32 %0, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %div = sdiv i32 %add, %0
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %y.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
+  store i32 %y.0, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 128
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
Index: test/Transforms/LoopVectorize/if-pred-stores.ll
===================================================================
--- test/Transforms/LoopVectorize/if-pred-stores.ll
+++ test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -1,7 +1,6 @@
 ; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=UNROLL
 ; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY
 ; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=VEC
-; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg -instcombine < %s | FileCheck %s --check-prefix=VEC-IC
 
"e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" @@ -17,49 +16,27 @@ ; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], ; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0 ; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true -; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 -; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 ; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]] ; ; VEC: [[cond]]: +; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 +; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 ; VEC: store i32 %[[v13]], i32* %[[v14]], align 4 ; VEC: br label %[[else:.+]] ; ; VEC: [[else]]: ; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1 ; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true -; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 -; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 ; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]] ; ; VEC: [[cond2]]: +; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 +; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 ; VEC: store i32 %[[v17]], i32* %[[v18]], align 4 ; VEC: br label %[[else2:.+]] ; ; VEC: [[else2]]: -; VEC-IC-LABEL: test -; VEC-IC: %[[v1:.+]] = icmp sgt <2 x i32> %{{.*}}, -; VEC-IC: %[[v2:.+]] = add nsw <2 x i32> %{{.*}}, -; VEC-IC: %[[v3:.+]] = extractelement <2 x i1> %[[v1]], i32 0 -; VEC-IC: br i1 %[[v3]], label %[[cond:.+]], label %[[else:.+]] -; -; VEC-IC: [[cond]]: -; VEC-IC: %[[v4:.+]] = extractelement <2 x i32> %[[v2]], i32 0 -; VEC-IC: store i32 %[[v4]], i32* %{{.*}}, align 4 -; VEC-IC: br label %[[else:.+]] -; -; VEC-IC: [[else]]: -; VEC-IC: %[[v5:.+]] = extractelement <2 x i1> %[[v1]], i32 1 -; VEC-IC: br i1 %[[v5]], label %[[cond2:.+]], label %[[else2:.+]] -; -; VEC-IC: [[cond2]]: -; VEC-IC: %[[v6:.+]] = extractelement <2 x i32> %[[v2]], i32 1 -; VEC-IC: store i32 %[[v6]], i32* %{{.*}}, align 4 -; VEC-IC: br label %[[else2:.+]] -; -; VEC-IC: [[else2]]: - ; UNROLL-LABEL: test ; UNROLL: vector.body: ; UNROLL: %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0