Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -367,8 +367,9 @@ /// See PR14725. void fixLCSSAPHIs(); - /// Predicate conditional stores on their respective conditions. - void predicateStores(); + /// Predicate conditional instructions that require predication on their + /// respective conditions. + void predicateInstructions(); /// Shrinks vector element sizes based on information in "MinBWs". void truncateToMinimalBitwidths(); @@ -395,11 +396,11 @@ void updateAnalysis(); /// This instruction is un-vectorizable. Implement it as a sequence - /// of scalars. If \p IfPredicateStore is true we need to 'hide' each + /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each /// scalarized instruction behind an if block predicated on the control /// dependence of the instruction. virtual void scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore = false); + bool IfPredicateInstr = false); /// Vectorize Load and Store instructions, virtual void vectorizeMemoryInstruction(Instruction *Instr); @@ -605,7 +606,7 @@ /// Store instructions that should be predicated, as a pair /// - SmallVector, 4> PredicatedStores; + SmallVector, 4> PredicatedInstructions; EdgeMaskCache MaskCache; /// Trip count of the original loop. 
Value *TripCount; @@ -635,7 +636,7 @@ private: void scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore = false) override; + bool IfPredicateInstr = false) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; Value *getStepVector(Value *Val, int StartIdx, Value *Step, @@ -2746,8 +2747,11 @@ } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore) { + bool IfPredicateInstr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + DEBUG(dbgs() << "LV: Scalarizing" + << (IfPredicateInstr ? " and predicating:" : ":") << *Instr + << '\n'); // Holds vector parameters or scalars, in case of uniform vals. SmallVector Params; @@ -2791,7 +2795,7 @@ VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); VectorParts Cond; - if (IfPredicateStore) { + if (IfPredicateInstr) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), @@ -2805,7 +2809,7 @@ // Start if-block. Value *Cmp = nullptr; - if (IfPredicateStore) { + if (IfPredicateInstr) { Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); @@ -2844,9 +2848,8 @@ VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, Builder.getInt32(Width)); // End if-block. - if (IfPredicateStore) - PredicatedStores.push_back( - std::make_pair(cast(Cloned), Cmp)); + if (IfPredicateInstr) + PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } } } @@ -3377,9 +3380,13 @@ return V; } -/// Estimate the overhead of scalarizing a value. Insert and Extract are set if -/// the result needs to be inserted and/or extracted from vectors. +/// \brief Estimate the overhead of scalarizing a value based on its type. 
+/// Insert and Extract are set if the result needs to be inserted and/or +/// extracted from vectors. +/// If the instruction is also to be predicated, add the cost of a PHI +/// node to the insertion cost. static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, + bool Predicated, const TargetTransformInfo &TTI) { if (Ty->isVoidTy()) return 0; @@ -3388,15 +3395,58 @@ unsigned Cost = 0; for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) { - if (Insert) - Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I); if (Extract) Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I); + if (Insert) { + Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I); + if (Predicated) + Cost += TTI.getCFInstrCost(Instruction::PHI); + } } + // We assume that if-converted blocks have a 50% chance of being executed. + // Predicated scalarized instructions are avoided due to the CF that bypasses + // turned off lanes. The extracts and inserts will be sunk/hoisted to the + // predicated basic-block and are subjected to the same assumption. + if (Predicated) + Cost /= 2; + return Cost; } +/// \brief Estimate the overhead of scalarizing an Instruction based on the +/// types of its operands and return value. +static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys, + Type *RetTy, bool Predicated, + const TargetTransformInfo &TTI) { + unsigned ScalarizationCost = + getScalarizationOverhead(RetTy, true, false, Predicated, TTI); + + for (Type *Ty : OpTys) + ScalarizationCost += + getScalarizationOverhead(Ty, false, true, Predicated, TTI); + + return ScalarizationCost; +} + +/// \brief Estimate the overhead of scalarizing an instruction. This is a +/// convenience wrapper for the type-based getScalarizationOverhead API. 
+static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, + bool Predicated, + const TargetTransformInfo &TTI) { + if (VF == 1) + return 0; + + Type *RetTy = ToVectorTy(I->getType(), VF); + + SmallVector OpTys; + unsigned OperandsNum = I->getNumOperands(); + for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd) + OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF)); + + return getScalarizationOverhead(OpTys, RetTy, Predicated, TTI); +} + // Estimate cost of a call instruction CI if it were vectorized with factor VF. // Return the cost of the instruction, including scalarization overhead if it's // needed. The flag NeedToScalarize shows if the call needs to be scalarized - @@ -3427,10 +3477,7 @@ // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - unsigned ScalarizationCost = - getScalarizationOverhead(RetTy, true, false, TTI); - for (Type *Ty : Tys) - ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI); + unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, false, TTI); unsigned Cost = ScalarCallCost * VF + ScalarizationCost; @@ -3850,7 +3897,7 @@ // Make sure DomTree is updated. updateAnalysis(); - predicateStores(); + predicateInstructions(); // Remove redundant induction instructions. cse(LoopVectorBody); @@ -4017,17 +4064,93 @@ LoopMiddleBlock); } } - -void InnerLoopVectorizer::predicateStores() { - for (auto KV : PredicatedStores) { + +void InnerLoopVectorizer::predicateInstructions() { + + // For each instruction I marked for predication on value C, split I into its + // own basic block to form an if-then construct over C. + // Since I may be fed by extractelement and/or be feeding an insertelement + // generated during scalarization we try to move such instructions into the + // predicated basic block as well. 
For the insertelement this also means that + // the PHI will be created for the resulting vector rather than for the + // scalar instruction. So for some scalarized instruction, e.g. + // + // %34 = extractelement <2 x i32> %26, i32 0 + // %35 = extractelement <2 x i32> %wide.load, i32 0 + // %36 = sdiv i32 %34, %35 + // %37 = insertelement <2 x i32> undef, i32 %36, i32 0 + // + // predication typically yields: + // + // %33 = icmp eq i1 %32, true + // br i1 %33, label %pred.sdiv.if, label %pred.sdiv.continue + // + // pred.sdiv.if: ; preds = %vector.body + // %34 = extractelement <2 x i32> %26, i32 0 + // %35 = extractelement <2 x i32> %wide.load, i32 0 + // %36 = sdiv i32 %34, %35 + // %37 = insertelement <2 x i32> undef, i32 %36, i32 0 + // br label %pred.sdiv.continue + // + // pred.sdiv.continue: ; preds = %pred.sdiv.if, %vector.body + // %38 = phi <2 x i32> [ undef, %vector.body ], [ %37, %pred.sdiv.if ] + + for (auto KV : PredicatedInstructions) { BasicBlock::iterator I(KV.first); - auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI); + BasicBlock *Head = I->getParent(); + auto *BB = SplitBlock(Head, &*std::next(I), DT, LI); auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, /*BranchWeights=*/nullptr, DT, LI); I->moveBefore(T); - I->getParent()->setName("pred.store.if"); - BB->setName("pred.store.continue"); + // Try to move any extractelement we may have created for the predicated + // instruction into the Then block. + for (Use &Op : I->operands()) { + auto *OpInst = dyn_cast(&*Op); + if (!OpInst) + continue; + bool CanSinkToUse = true; + for (User *U : OpInst->users()) { + if (U != &*I) { + // The extractelement is feeding another instruction - give up. 
+ CanSinkToUse = false; + break; + } + } + if (CanSinkToUse) + OpInst->moveBefore(&*I); + } + + I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if"); + BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue"); + + // If the instruction is non-void create a Phi node at reconvergence point. + if (!I->getType()->isVoidTy()) { + Value *IncomingTrue = nullptr; + Value *IncomingFalse = nullptr; + + if (I->hasOneUse() && isa(*I->user_begin())) { + // If the predicated instruction is feeding an insert-element, move it + // into the Then block; Phi node will be created for the vector. + InsertElementInst *IEI = cast(*I->user_begin()); + IEI->moveBefore(T); + IncomingTrue = IEI; // the new vector with the inserted element. + IncomingFalse = IEI->getOperand(0); // the unmodified vector + } else { + // Phi node will be created for the scalar predicated instruction. + IncomingTrue = &*I; + IncomingFalse = UndefValue::get(I->getType()); + } + + BasicBlock *PostDom = I->getParent()->getSingleSuccessor(); + assert(PostDom && "Then block has multiple successors"); + PHINode *Phi = + PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front()); + IncomingTrue->replaceAllUsesWith(Phi); + Phi->addIncoming(IncomingFalse, Head); + Phi->addIncoming(IncomingTrue, I->getParent()); + } } + DEBUG(DT->verifyDomTree()); } @@ -4215,6 +4338,24 @@ } } +/// A helper function for checking whether an integer division-related +/// instruction may divide by zero (in which case it must be predicated if +/// executed conditionally in the scalar code). +/// TODO: It may be worthwhile to generalize and check isKnownNonZero(). +/// Non-zero divisors that are non compile-time constants will not be +/// converted into multiplication, so we will still end up scalarizing +/// the division, but can do so w/o predication. 
+static bool mayDivideByZero(Instruction &I) { + assert((I.getOpcode() == Instruction::UDiv || + I.getOpcode() == Instruction::SDiv || + I.getOpcode() == Instruction::URem || + I.getOpcode() == Instruction::SRem) && + "Unexpected instruction"); + Value *Divisor = I.getOperand(1); + auto *CInt = dyn_cast(Divisor); + return !CInt || CInt->isZero(); +} + void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (Instruction &I : *BB) { @@ -4231,17 +4372,23 @@ continue; } // End of PHI. + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + // Scalarize with predication if this instruction may divide by zero and + // block execution is conditional, otherwise fallthrough. + if (mayDivideByZero(I) && Legal->blockNeedsPredication(I.getParent())) { + scalarizeInstruction(&I, true); + continue; + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: @@ -5132,17 +5279,6 @@ } if (I.mayThrow()) return false; - - // The instructions below can trap. - switch (I.getOpcode()) { - default: - continue; - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - return false; - } } return true; @@ -5915,7 +6051,6 @@ LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; - // For each block. for (BasicBlock *BB : TheLoop->blocks()) { VectorizationCostTy BlockCost; @@ -6059,17 +6194,24 @@ // TODO: IF-converted IFs become selects. 
return 0; } + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + // We assume that if-converted blocks have a 50% chance of being executed. + // Predicated scalarized instructions are avoided due to the CF that + // bypasses turned off lanes. If we are not predicating, fallthrough. + if (VF > 1 && mayDivideByZero(*I) && + Legal->blockNeedsPredication(I->getParent())) + return VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy) / 2 + + getScalarizationOverhead(I, VF, true, TTI); case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: @@ -6303,28 +6445,11 @@ return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI)); return CallCost; } - default: { - // We are scalarizing the instruction. Return the cost of the scalar - // instruction, plus the cost of insert and extract into vector - // elements, times the vector width. - unsigned Cost = 0; - - if (!RetTy->isVoidTy() && VF != 1) { - unsigned InsCost = - TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy); - unsigned ExtCost = - TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy); - - // The cost of inserting the results plus extracting each one of the - // operands. - Cost += VF * (InsCost + ExtCost * I->getNumOperands()); - } - + default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); - return Cost; - } + return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + + getScalarizationOverhead(I, VF, false, TTI); } // end of switch. 
} @@ -6382,7 +6507,7 @@ } void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore) { + bool IfPredicateInstr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector Params; @@ -6425,7 +6550,7 @@ VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); VectorParts Cond; - if (IfPredicateStore) { + if (IfPredicateInstr) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), @@ -6438,7 +6563,7 @@ // Start an "if (pred) a[i] = ..." block. Value *Cmp = nullptr; - if (IfPredicateStore) { + if (IfPredicateInstr) { if (Cond[Part]->getType()->isVectorTy()) Cond[Part] = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); @@ -6469,16 +6594,16 @@ VecResults[Part] = Cloned; // End if-block. - if (IfPredicateStore) - PredicatedStores.push_back(std::make_pair(cast(Cloned), Cmp)); + if (IfPredicateInstr) + PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } } void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { auto *SI = dyn_cast(Instr); - bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); + bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); - return scalarizeInstruction(Instr, IfPredicateStore); + return scalarizeInstruction(Instr, IfPredicateInstr); } Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } Index: test/Transforms/LoopVectorize/if-pred-non-void.ll =================================================================== --- test/Transforms/LoopVectorize/if-pred-non-void.ll +++ test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -0,0 +1,149 @@ +; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s + +target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Test predication of non-void instructions, specifically (i) that these +; instructions permit vectorization and (ii) the creation of an insertelement +; and a Phi node. For each predicated instruction we search for the code +; generated for the first element. +define void @test(i32* nocapture %asd, i32* nocapture %aud, + i32* nocapture %asr, i32* nocapture %aur) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %if.end + ret void + +; CHECK-LABEL: test +; CHECK: vector.body: +; CHECK: %{{.*}} = extractelement <2 x i1> %{{.*}}, i32 0 +; CHECK: br i1 %{{.*}}, label %[[CSD:[a-zA-Z0-9.]+]], label %[[ESD:[a-zA-Z0-9.]+]] +; CHECK: [[CSD]]: +; CHECK: %[[SD0:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}} +; CHECK: %[[SD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SD0]], i32 0 +; CHECK: br label %[[ESD]] +; CHECK: [[ESD]]: +; CHECK: %{{.*}} = phi <2 x i32> [ undef, %vector.body ], [ %[[SD1]], %[[CSD]] ] +; CHECK: %{{.*}} = extractelement <2 x i1> %{{.*}}, i32 0 +; CHECK: br i1 %{{.*}}, label %[[CUD:[a-zA-Z0-9.]+]], label %[[EUD:[a-zA-Z0-9.]+]] +; CHECK: [[CUD]]: +; CHECK: %[[UD0:[a-zA-Z0-9]+]] = udiv i32 %{{.*}}, %{{.*}} +; CHECK: %[[UD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UD0]], i32 0 +; CHECK: br label %[[EUD]] +; CHECK: [[EUD]]: +; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UD1]], %[[CUD]] ] +; CHECK: %{{.*}} = extractelement <2 x i1> %{{.*}}, i32 0 +; CHECK: br i1 %{{.*}}, label %[[CSR:[a-zA-Z0-9.]+]], label %[[ESR:[a-zA-Z0-9.]+]] +; CHECK: [[CSR]]: +; CHECK: %[[SR0:[a-zA-Z0-9]+]] = srem i32 %{{.*}}, %{{.*}} +; CHECK: %[[SR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SR0]], i32 0 +; CHECK: br label %[[ESR]] +; CHECK: [[ESR]]: +; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[SR1]], %[[CSR]] ] +; CHECK: %{{.*}} = extractelement <2 x i1> %{{.*}}, i32 0 +; CHECK: br i1 %{{.*}}, label %[[CUR:[a-zA-Z0-9.]+]], label 
%[[EUR:[a-zA-Z0-9.]+]] +; CHECK: [[CUR]]: +; CHECK: %[[UR0:[a-zA-Z0-9]+]] = urem i32 %{{.*}}, %{{.*}} +; CHECK: %[[UR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UR0]], i32 0 +; CHECK: br label %[[EUR]] +; CHECK: [[EUR]]: +; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UR1]], %[[CUR]] ] + +for.body: ; preds = %if.end, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ] + %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv + %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv + %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv + %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv + %lsd = load i32, i32* %isd, align 4 + %lud = load i32, i32* %iud, align 4 + %lsr = load i32, i32* %isr, align 4 + %lur = load i32, i32* %iur, align 4 + %psd = add nsw i32 %lsd, 23 + %pud = add nsw i32 %lud, 24 + %psr = add nsw i32 %lsr, 25 + %pur = add nsw i32 %lur, 26 + %cmp1 = icmp slt i32 %lsd, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %rsd = sdiv i32 %psd, %lsd + %rud = udiv i32 %pud, %lud + %rsr = srem i32 %psr, %lsr + %rur = urem i32 %pur, %lur + br label %if.end + +if.end: ; preds = %if.then, %for.body + %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ] + %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ] + %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ] + %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ] + store i32 %ysd.0, i32* %isd, align 4 + store i32 %yud.0, i32* %iud, align 4 + store i32 %ysr.0, i32* %isr, align 4 + store i32 %yur.0, i32* %iur, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 128 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +declare i32 @scalarized(i32 %a, i32 %b) + +; Future-use test for predication under smarter scalar-scalar: this test will +; fail when the vectorizer starts feeding scalarized values directly to their +; 
scalar users, i.e. w/o generating redundant insertelement/extractelement +; instructions. This case is already supported by the predication code (which +; should generate a phi for the scalar predicated value rather than for the +; insertelement), but cannot be tested yet. +; If you got this test to fail, fix the test by using the alternative FFU +; sequence to make this test check how we handle this case from now on. +define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %if.end + ret void + +; CHECK-LABEL: test_scalar2scalar +; CHECK: vector.body: +; CHECK: %{{.*}} = extractelement <2 x i1> %{{.*}}, i32 0 +; CHECK: br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]] +; CHECK: [[THEN]]: +; CHECK: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}} +; CHECK: %[[PDV:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[PD]], i32 0 +; CHECK: br label %[[FI]] +; CHECK: [[FI]]: +; CHECK: %[[PH:[a-zA-Z0-9]+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[PDV]], %[[THEN]] ] +; FFU-LABEL: test_scalar2scalar +; FFU: vector.body: +; FFU: %{{.*}} = extractelement <2 x i1> %{{.*}}, i32 0 +; FFU: br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]] +; FFU: [[THEN]]: +; FFU: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}} +; FFU: br label %[[FI]] +; FFU: [[FI]]: +; FFU: %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ] + +for.body: ; preds = %if.end, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ] + %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv + %lsd = load i32, i32* %isd, align 4 + %psd = add nsw i32 %lsd, 23 + %cmp1 = icmp slt i32 %lsd, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %rsd = sdiv i32 %psd, %lsd + br label %if.end + +if.end: ; preds = %if.then, %for.body + %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ] + %isd.b = getelementptr inbounds 
i32, i32* %bsd, i64 %indvars.iv + %lsd.b = load i32, i32* %isd.b, align 4 + %z = sdiv i32 %lsd.b, %ysd.0 + store i32 %z, i32* %isd, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 128 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} Index: test/Transforms/LoopVectorize/if-pred-not-when-safe.ll =================================================================== --- test/Transforms/LoopVectorize/if-pred-not-when-safe.ll +++ test/Transforms/LoopVectorize/if-pred-not-when-safe.ll @@ -0,0 +1,90 @@ +; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Test no-predication of instructions that are provably safe, e.g. dividing by +; a non-zero constant. +define void @test(i32* nocapture %asd, i32* nocapture %aud, + i32* nocapture %asr, i32* nocapture %aur, + i32* nocapture %asd0, i32* nocapture %aud0, + i32* nocapture %asr0, i32* nocapture %aur0 +) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %if.end + ret void + +; CHECK-LABEL: test +; CHECK: vector.body: +; CHECK: %{{.*}} = sdiv <2 x i32> %{{.*}}, +; CHECK: %{{.*}} = udiv <2 x i32> %{{.*}}, +; CHECK: %{{.*}} = srem <2 x i32> %{{.*}}, +; CHECK: %{{.*}} = urem <2 x i32> %{{.*}}, +; CHECK-NOT: %{{.*}} = sdiv <2 x i32> %{{.*}}, +; CHECK-NOT: %{{.*}} = udiv <2 x i32> %{{.*}}, +; CHECK-NOT: %{{.*}} = srem <2 x i32> %{{.*}}, +; CHECK-NOT: %{{.*}} = urem <2 x i32> %{{.*}}, + +for.body: ; preds = %if.end, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ] + %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv + %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv + %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv + %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv + %lsd = load i32, i32* %isd, 
align 4 + %lud = load i32, i32* %iud, align 4 + %lsr = load i32, i32* %isr, align 4 + %lur = load i32, i32* %iur, align 4 + %psd = add nsw i32 %lsd, 23 + %pud = add nsw i32 %lud, 24 + %psr = add nsw i32 %lsr, 25 + %pur = add nsw i32 %lur, 26 + %isd0 = getelementptr inbounds i32, i32* %asd0, i64 %indvars.iv + %iud0 = getelementptr inbounds i32, i32* %aud0, i64 %indvars.iv + %isr0 = getelementptr inbounds i32, i32* %asr0, i64 %indvars.iv + %iur0 = getelementptr inbounds i32, i32* %aur0, i64 %indvars.iv + %lsd0 = load i32, i32* %isd0, align 4 + %lud0 = load i32, i32* %iud0, align 4 + %lsr0 = load i32, i32* %isr0, align 4 + %lur0 = load i32, i32* %iur0, align 4 + %psd0 = add nsw i32 %lsd, 27 + %pud0 = add nsw i32 %lud, 28 + %psr0 = add nsw i32 %lsr, 29 + %pur0 = add nsw i32 %lur, 30 + %cmp1 = icmp slt i32 %lsd, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %rsd = sdiv i32 %psd, 11 + %rud = udiv i32 %pud, 13 + %rsr = srem i32 %psr, 17 + %rur = urem i32 %pur, 19 + %rsd0 = sdiv i32 %psd0, 0 + %rud0 = udiv i32 %pud0, 0 + %rsr0 = srem i32 %psr0, 0 + %rur0 = urem i32 %pur0, 0 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ] + %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ] + %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ] + %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ] + %ysd0.0 = phi i32 [ %rsd0, %if.then ], [ %psd0, %for.body ] + %yud0.0 = phi i32 [ %rud0, %if.then ], [ %pud0, %for.body ] + %ysr0.0 = phi i32 [ %rsr0, %if.then ], [ %psr0, %for.body ] + %yur0.0 = phi i32 [ %rur0, %if.then ], [ %pur0, %for.body ] + store i32 %ysd.0, i32* %isd, align 4 + store i32 %yud.0, i32* %iud, align 4 + store i32 %ysr.0, i32* %isr, align 4 + store i32 %yur.0, i32* %iur, align 4 + store i32 %ysd0.0, i32* %isd0, align 4 + store i32 %yud0.0, i32* %iud0, align 4 + store i32 %ysr0.0, i32* %isr0, align 4 + store i32 %yur0.0, i32* %iur0, align 4 + 
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 128 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} Index: test/Transforms/LoopVectorize/if-pred-stores.ll =================================================================== --- test/Transforms/LoopVectorize/if-pred-stores.ll +++ test/Transforms/LoopVectorize/if-pred-stores.ll @@ -1,7 +1,6 @@ ; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=UNROLL ; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY ; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=VEC -; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg -instcombine < %s | FileCheck %s --check-prefix=VEC-IC target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" @@ -17,49 +16,27 @@ ; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], ; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0 ; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true -; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 -; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 ; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]] ; ; VEC: [[cond]]: +; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 +; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 ; VEC: store i32 %[[v13]], i32* %[[v14]], align 4 ; VEC: br label %[[else:.+]] ; ; VEC: [[else]]: ; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1 ; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true -; 
VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 -; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 ; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]] ; ; VEC: [[cond2]]: +; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 +; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 ; VEC: store i32 %[[v17]], i32* %[[v18]], align 4 ; VEC: br label %[[else2:.+]] ; ; VEC: [[else2]]: -; VEC-IC-LABEL: test -; VEC-IC: %[[v1:.+]] = icmp sgt <2 x i32> %{{.*}}, -; VEC-IC: %[[v2:.+]] = add nsw <2 x i32> %{{.*}}, -; VEC-IC: %[[v3:.+]] = extractelement <2 x i1> %[[v1]], i32 0 -; VEC-IC: br i1 %[[v3]], label %[[cond:.+]], label %[[else:.+]] -; -; VEC-IC: [[cond]]: -; VEC-IC: %[[v4:.+]] = extractelement <2 x i32> %[[v2]], i32 0 -; VEC-IC: store i32 %[[v4]], i32* %{{.*}}, align 4 -; VEC-IC: br label %[[else:.+]] -; -; VEC-IC: [[else]]: -; VEC-IC: %[[v5:.+]] = extractelement <2 x i1> %[[v1]], i32 1 -; VEC-IC: br i1 %[[v5]], label %[[cond2:.+]], label %[[else2:.+]] -; -; VEC-IC: [[cond2]]: -; VEC-IC: %[[v6:.+]] = extractelement <2 x i32> %[[v2]], i32 1 -; VEC-IC: store i32 %[[v6]], i32* %{{.*}}, align 4 -; VEC-IC: br label %[[else2:.+]] -; -; VEC-IC: [[else2]]: - ; UNROLL-LABEL: test ; UNROLL: vector.body: ; UNROLL: %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0