Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -518,6 +518,10 @@
   /// induction variable will first be truncated to the corresponding type.
   void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
 
+  /// Returns true if an instruction \p I should be scalarized instead of
+  /// vectorized for the chosen vectorization factor.
+  bool shouldScalarizeInstruction(Instruction *I) const;
+
   /// Returns true if we should generate a scalar version of \p IV.
   bool needsScalarInduction(Instruction *IV) const;
 
@@ -1907,6 +1911,15 @@
     return MinBWs;
   }
 
+  /// \returns True if it is more profitable to scalarize instruction \p I for
+  /// vectorization factor \p VF.
+  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+    auto Scalars = InstsToScalarize.find(VF);
+    assert(Scalars != InstsToScalarize.end() &&
+           "VF not yet analyzed for scalarization profitability");
+    return Scalars->second.count(I);
+  }
+
 private:
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
@@ -1949,6 +1962,29 @@
   /// to this type.
   MapVector<Instruction *, uint64_t> MinBWs;
 
+  /// A type representing the costs for instructions if they were to be
+  /// scalarized rather than vectorized. The entries are Instruction-Cost
+  /// pairs.
+  typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
+
+  /// A map holding scalar costs for different vectorization factors. The
+  /// presence of a cost for an instruction in the mapping indicates that the
+  /// instruction will be scalarized when vectorizing with the associated
+  /// vectorization factor. The entries are VF-ScalarCostsTy pairs.
+  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+
+  /// Returns the expected difference in cost from scalarizing the expression
+  /// feeding a predicated instruction \p PredInst. The instructions to
+  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
+  /// non-negative return value implies the expression will be scalarized.
+  /// Currently, only single-use chains are considered for scalarization.
+  int computePredInstDiscount(Instruction *PredInst,
+                              ScalarCostsTy &ScalarCosts, unsigned VF);
+
+  /// Collects the instructions to scalarize for each predicated instruction in
+  /// the loop.
+  void collectInstsToScalarize(unsigned VF);
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -2183,12 +2219,17 @@
   VecInd->addIncoming(LastInduction, LoopVectorLatch);
 }
 
+bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
+  return Legal->isScalarAfterVectorization(I) ||
+         Cost->isProfitableToScalarize(I, VF);
+}
+
 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
-  if (Legal->isScalarAfterVectorization(IV))
+  if (shouldScalarizeInstruction(IV))
    return true;
   auto isScalarInst = [&](User *U) -> bool {
     auto *I = cast<Instruction>(U);
-    return (OrigLoop->contains(I) && Legal->isScalarAfterVectorization(I));
+    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
   };
   return any_of(IV->users(), isScalarInst);
 }
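
The new cost-model state above has a deliberate contract: collectInstsToScalarize must record an entry for a vectorization factor, even an empty one, before isProfitableToScalarize may be queried for that factor, which is what the assert enforces. A minimal, self-contained model of that contract (not LLVM code; the instruction key and cost are hypothetical stand-ins):

#include <cassert>
#include <unordered_map>

using InstKey = int; // stand-in for Instruction *

// Keyed by VF; each nested map holds the instructions to scalarize for that
// VF together with their computed scalar costs.
std::unordered_map<unsigned, std::unordered_map<InstKey, unsigned>>
    InstsToScalarize;

void collectInstsToScalarize(unsigned VF) {
  if (VF < 2 || InstsToScalarize.count(VF))
    return; // scalar loop, or this VF was already analyzed
  // operator[] default-constructs the per-VF map, so even an analysis that
  // finds nothing profitable marks the VF as analyzed.
  auto &ScalarCostsVF = InstsToScalarize[VF];
  ScalarCostsVF[42] = 4; // hypothetical profitable instruction and its cost
}

bool isProfitableToScalarize(InstKey I, unsigned VF) {
  auto It = InstsToScalarize.find(VF);
  assert(It != InstsToScalarize.end() &&
         "VF not yet analyzed for scalarization profitability");
  return It->second.count(I) != 0;
}

int main() {
  collectInstsToScalarize(2);
  assert(isProfitableToScalarize(42, 2) && !isProfitableToScalarize(7, 2));
  return 0;
}
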
@@ -2229,7 +2270,7 @@
   // create the phi node, we will splat the scalar induction variable in each
   // loop iteration.
   if (VF > 1 && IV->getType() == Induction->getType() && Step &&
-      !Legal->isScalarAfterVectorization(EntryVal)) {
+      !shouldScalarizeInstruction(EntryVal)) {
     createVectorIntInductionPHI(ID, EntryVal);
     VectorizedIV = true;
   }
@@ -4648,10 +4689,11 @@
       continue;
 
     // Scalarize instructions that should remain scalar after vectorization.
-    if (!(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
+    if (VF > 1 &&
+        !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
           isa<DbgInfoIntrinsic>(&I)) &&
-        Legal->isScalarAfterVectorization(&I)) {
-      scalarizeInstruction(&I);
+        shouldScalarizeInstruction(&I)) {
+      scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
       continue;
     }
 
@@ -6124,6 +6166,7 @@
     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
 
     Factor.Width = UserVF;
+    collectInstsToScalarize(UserVF);
     return Factor;
   }
 
@@ -6530,10 +6573,160 @@
   return RUs;
 }
 
+void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
+
+  // If we aren't vectorizing the loop, or if we've already collected the
+  // instructions to scalarize, there's nothing to do. Collection may already
+  // have occurred if we have a user-selected VF and are now computing the
+  // expected cost for interleaving.
+  if (VF < 2 || InstsToScalarize.count(VF))
+    return;
+
+  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
+  // not profitable to scalarize any instructions, the presence of VF in the
+  // map will indicate that we've analyzed it already.
+  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
+
+  // Find all the instructions that are scalar with predication in the loop and
+  // determine if it would be better not to if-convert the blocks they are in.
+  // If so, we also record the instructions to scalarize.
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    if (!Legal->blockNeedsPredication(BB))
+      continue;
+    for (Instruction &I : *BB)
+      if (Legal->isScalarWithPredication(&I)) {
+        ScalarCostsTy ScalarCosts;
+        if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+      }
+  }
+}
+
+int LoopVectorizationCostModel::computePredInstDiscount(
+    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
+    unsigned VF) {
+
+  assert(!Legal->isUniformAfterVectorization(PredInst) &&
+         "Instruction marked uniform-after-vectorization will be predicated");
+
+  // Initialize the discount to zero, meaning that the scalar version and the
+  // vector version cost the same.
+  int Discount = 0;
+
+  // Holds instructions to analyze. The instructions we visit are mapped in
+  // ScalarCosts. Those instructions are the ones that would be scalarized if
+  // we find that the scalar version costs less.
+  SmallVector<Instruction *, 8> Worklist;
+
+  // Returns true if the given instruction can be scalarized.
+  auto canBeScalarized = [&](Instruction *I) -> bool {
+
+    // We only attempt to scalarize instructions forming a single-use chain
+    // from the original predicated block that would otherwise be vectorized.
+    // Although not strictly necessary, we give up on instructions we know will
+    // already be scalar to avoid traversing chains that are unlikely to be
+    // beneficial.
+    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
+        Legal->isScalarAfterVectorization(I))
+      return false;
+
+    // If the instruction is scalar with predication, it will be analyzed
+    // separately. We ignore it within the context of PredInst.
+    if (Legal->isScalarWithPredication(I))
+      return false;
+
+    // If any of the instruction's operands are uniform after vectorization,
+    // the instruction cannot be scalarized.
+    // This prevents, for example, a masked load from being scalarized.
+    //
+    // We assume we will only emit a value for lane zero of an instruction
+    // marked uniform after vectorization, rather than VF identical values.
+    // Thus, if we scalarize an instruction that uses a uniform, we would
+    // create uses of values corresponding to the lanes we aren't emitting code
+    // for. This behavior can be changed by allowing getScalarValue to clone
+    // the lane zero values for uniforms rather than asserting.
+    for (Use &U : I->operands())
+      if (auto *J = dyn_cast<Instruction>(U.get()))
+        if (Legal->isUniformAfterVectorization(J))
+          return false;
+
+    // Otherwise, we can scalarize the instruction.
+    return true;
+  };
+
+  // Returns true if an operand that cannot be scalarized must be extracted
+  // from a vector. We will account for this scalarization overhead below. Note
+  // that the non-void predicated instructions are placed in their own blocks,
+  // and their return values are inserted into vectors. Thus, an extract would
+  // still be required.
+  auto needsExtract = [&](Instruction *I) -> bool {
+    return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
+  };
+
+  // Compute the expected cost discount from scalarizing the entire expression
+  // feeding the predicated instruction. We currently only consider expressions
+  // that are single-use instruction chains.
+  Worklist.push_back(PredInst);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+
+    // If we've already analyzed the instruction, there's nothing to do.
+    if (ScalarCosts.count(I))
+      continue;
+
+    // Compute the cost of the vector instruction. Note that this cost already
+    // includes the scalarization overhead of the predicated instruction.
+    unsigned VectorCost = getInstructionCost(I, VF).first;
+
+    // Compute the cost of the scalarized instruction. This cost is the cost of
+    // the instruction as if it wasn't if-converted and instead remained in the
+    // predicated block. We will scale this cost by block probability after
+    // computing the scalarization overhead.
+    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
+
+    // Compute the scalarization overhead of needed insertelement instructions
+    // and phi nodes.
+    if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+      ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true,
+                                             false, TTI);
+      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
+    }
+
+    // Compute the scalarization overhead of needed extractelement
+    // instructions. For each of the instruction's operands, if the operand can
+    // be scalarized, add it to the worklist; otherwise, account for the
+    // overhead.
+    for (Use &U : I->operands())
+      if (auto *J = dyn_cast<Instruction>(U.get())) {
+        assert(VectorType::isValidElementType(J->getType()) &&
+               "Instruction has non-scalar type");
+        if (canBeScalarized(J))
+          Worklist.push_back(J);
+        else if (needsExtract(J))
+          ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
+                                                 false, true, TTI);
+      }
+
+    // Scale the total scalar cost by block probability.
+    ScalarCost /= getReciprocalPredBlockProb();
+
+    // Compute the discount. A non-negative discount means the vector version
+    // of the instruction costs more, and scalarizing would be beneficial.
+    Discount += VectorCost - ScalarCost;
+    ScalarCosts[I] = ScalarCost;
+  }
+
+  return Discount;
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(unsigned VF) {
   VectorizationCostTy Cost;
 
+  // Collect the instructions (and their associated costs) that will be more
+  // profitable to scalarize.
+  collectInstsToScalarize(VF);
+
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     VectorizationCostTy BlockCost;
@@ -6641,6 +6834,9 @@
   if (Legal->isUniformAfterVectorization(I))
     VF = 1;
 
+  if (VF > 1 && isProfitableToScalarize(I, VF))
+    return VectorizationCostTy(InstsToScalarize[VF][I], false);
+
   Type *VectorTy;
   unsigned C = getInstructionCost(I, VF, VectorTy);
 
@@ -7007,7 +7203,14 @@
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
 
-  // Insert values known to be scalar into VecValuesToIgnore.
+  // Insert values known to be scalar into VecValuesToIgnore. This is a
+  // conservative estimation of the values that will later be scalarized.
+  //
+  // FIXME: Even though an instruction is not scalar-after-vectorization, it
+  //        may still be scalarized. For example, we may find it more
+  //        profitable to scalarize an instruction for a given vectorization
+  //        factor. But at this point, we haven't yet computed the
+  //        vectorization factor.
   for (auto *BB : TheLoop->getBlocks())
     for (auto &I : *BB)
       if (Legal->isScalarAfterVectorization(&I))
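
The tests below all exercise variants of the same predicated-division loop. As a reading aid, here is a rough scalar-source equivalent of the first test, sketched from its IR (names and types are illustrative):

#include <cstdint>

// Mirrors @predicated_udiv_scalarized_operand in aarch64-predication.ll: the
// add feeding the udiv is needed only on the predicated path, so the
// vectorizer can sink it into the predicated block and scalarize it there.
uint64_t predicated_udiv_scalarized_operand(uint64_t *a, bool c, uint64_t x) {
  uint64_t r = 0;
  for (int64_t i = 0; i < 100; ++i) {
    uint64_t t = a[i];
    if (c)
      t = t / (t + x); // division guarded by the predicate
    r += t;
  }
  return r;
}
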
Index: llvm/trunk/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -0,0 +1,63 @@
+; RUN: opt < %s -loop-vectorize -simplifycfg -S | FileCheck %s
+; RUN: opt < %s -force-vector-width=2 -loop-vectorize -simplifycfg -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+;
+; This test checks that we correctly compute the scalarized operands for a
+; user-specified vectorization factor when interleaving is disabled. We use
+; the "optsize" attribute to disable all interleaving calculations.
+;
+; CHECK: vector.body:
+; CHECK: %wide.load = load <2 x i64>, <2 x i64>* {{.*}}, align 4
+; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK: %[[T00:.+]] = extractelement <2 x i64> %wide.load, i32 0
+; CHECK: %[[T01:.+]] = extractelement <2 x i64> %wide.load, i32 0
+; CHECK: %[[T02:.+]] = add nsw i64 %[[T01]], %x
+; CHECK: %[[T03:.+]] = udiv i64 %[[T00]], %[[T02]]
+; CHECK: %[[T04:.+]] = insertelement <2 x i64> undef, i64 %[[T03]], i32 0
+; CHECK: br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK: %[[T05:.+]] = phi <2 x i64> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
+; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK: %[[T06:.+]] = extractelement <2 x i64> %wide.load, i32 1
+; CHECK: %[[T07:.+]] = extractelement <2 x i64> %wide.load, i32 1
+; CHECK: %[[T08:.+]] = add nsw i64 %[[T07]], %x
+; CHECK: %[[T09:.+]] = udiv i64 %[[T06]], %[[T08]]
+; CHECK: %[[T10:.+]] = insertelement <2 x i64> %[[T05]], i64 %[[T09]], i32 1
+; CHECK: br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK: phi <2 x i64> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i64 @predicated_udiv_scalarized_operand(i64* %a, i1 %c, i64 %x) optsize {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
+  %tmp2 = load i64, i64* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp3 = add nsw i64 %tmp2, %x
+  %tmp4 = udiv i64 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i64 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, 100
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i64 [ %tmp6, %for.inc ]
+  ret i64 %tmp7
+}
Index: llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -57,9 +57,9 @@
 ; as:
 ;
 ; Cost of store:
-;   (store(4) + extractelement(6)) / 2 = 5
+;   (store(4) + extractelement(3)) / 2 = 3
 ;
-; CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
 ;
 define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) {
@@ -85,3 +85,147 @@
 for.end:
   ret void
 }
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated udiv
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block.
+; If we assume the block probability is 50%, we compute the cost as:
+;
+; Cost of add:
+;   (add(2) + extractelement(3)) / 2 = 2
+; Cost of udiv:
+;   (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4
+;
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
+;
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp2 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp3 = add nsw i32 %tmp2, %x
+  %tmp4 = udiv i32 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}
+
+; CHECK-LABEL: predicated_store_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated store
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+;   (add(2) + extractelement(3)) / 2 = 2
+; Cost of store:
+;   store(4) / 2 = 2
+;
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
+;
+define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp2 = add nsw i32 %tmp1, %x
+  store i32 %tmp2, i32* %tmp0, align 4
+  br label %for.inc
+
+for.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
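
The per-instruction numbers quoted in the comments above follow directly from the patch's formula: VF scalar copies plus insert/extract overhead, divided (with integer truncation) by the reciprocal block probability, which is 2 for an assumed 50% block. A self-contained check of the arithmetic, using the per-operation costs these AArch64 tests assume (add and udiv cost 1 per lane, extractelement and insertelement cost 3, a scalar store costs 2):

#include <cassert>

int main() {
  const unsigned VF = 2, RecipProb = 2; // 50% predicated-block probability

  // Scalarized add feeding the udiv:
  //   (VF * add(1) + extractelement(3)) / 2 = (2 + 3) / 2 = 2
  assert((VF * 1 + 3) / RecipProb == 2);

  // Scalarized, predicated udiv (its result must be reinserted):
  //   (VF * udiv(1) + extractelement(3) + insertelement(3)) / 2 = 4
  assert((VF * 1 + 3 + 3) / RecipProb == 4);

  // Scalarized, predicated store (void, so nothing to insert):
  //   (VF * store(2)) / 2 = 2
  assert((VF * 2) / RecipProb == 2);
  return 0;
}
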
+
+; CHECK-LABEL: predication_multi_context
+;
+; This test checks that we correctly compute the cost of multiple predicated
+; instructions in the same block. The sdiv, udiv, and store must be scalarized
+; and predicated. The sub feeding the store is scalarized and sunk inside the
+; store's predicated block. However, the add feeding the sdiv and udiv cannot
+; be sunk and is not scalarized. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+;   add(1) = 1
+; Cost of sdiv:
+;   (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+; Cost of udiv:
+;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+; Cost of sub:
+;   (sub(2) + extractelement(3)) / 2 = 2
+; Cost of store:
+;   store(4) / 2 = 2
+;
+; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4
+; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4
+;
+define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp2 = add i32 %tmp1, %x
+  %tmp3 = sdiv i32 %tmp1, %tmp2
+  %tmp4 = udiv i32 %tmp3, %tmp2
+  %tmp5 = sub i32 %tmp4, %x
+  store i32 %tmp5, i32* %tmp0, align 4
+  br label %for.inc
+
+for.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
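
The add in predication_multi_context stays vectorized because of the single-use restriction in canBeScalarized: a value feeding two predicated instructions cannot be sunk into either one's block. A toy illustration of just that check (not LLVM code):

#include <cassert>

struct Inst {
  unsigned NumUses = 0;
  bool hasOneUse() const { return NumUses == 1; }
};

int main() {
  Inst Add;        // %tmp2 = add i32 %tmp1, %x
  Add.NumUses = 2; // feeds both the sdiv and the udiv

  Inst Sub;        // %tmp5 = sub i32 %tmp4, %x
  Sub.NumUses = 1; // feeds only the predicated store

  assert(!Add.hasOneUse()); // stays vectorized (cost 1 in the test above)
  assert(Sub.hasOneUse());  // sunk and scalarized (cost 2 in the test above)
  return 0;
}
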
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -mattr=avx -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -simplifycfg -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK-LABEL: predicated_sdiv_masked_load
+;
+; This test ensures that we don't scalarize the predicated load. Since the
+; load can be vectorized with predication, scalarizing it would cause its
+; pointer operand to become non-uniform.
+;
+; CHECK: vector.body:
+; CHECK: %wide.masked.load = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32
+; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK: %[[T0:.+]] = extractelement <2 x i32> %wide.masked.load, i32 0
+; CHECK: %[[T1:.+]] = sdiv i32 %[[T0]], %x
+; CHECK: %[[T2:.+]] = insertelement <2 x i32> undef, i32 %[[T1]], i32 0
+; CHECK: br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK: %[[T3:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T2]], %[[IF0]] ]
+; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK: %[[T4:.+]] = extractelement <2 x i32> %wide.masked.load, i32 1
+; CHECK: %[[T5:.+]] = sdiv i32 %[[T4]], %x
+; CHECK: %[[T6:.+]] = insertelement <2 x i32> %[[T3]], i32 %[[T5]], i32 1
+; CHECK: br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK: phi <2 x i32> [ %[[T3]], %[[CONT0]] ], [ %[[T6]], %[[IF1]] ]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp7, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp2 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp3 = load i32, i32* %tmp2, align 4
+  %tmp4 = sdiv i32 %tmp3, %x
+  %tmp5 = add nsw i32 %tmp4, %tmp1
+  br label %for.inc
+
+for.inc:
+  %tmp6 = phi i32 [ %tmp1, %for.body ], [ %tmp5, %if.then]
+  %tmp7 = add i32 %r, %tmp6
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 10000
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  %tmp8 = phi i32 [ %tmp7, %for.inc ]
+  ret i32 %tmp8
+}
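
For reference, a rough scalar-source equivalent of the masked-load test above, sketched from its IR (names and types are illustrative):

#include <cstdint>

// Mirrors @predicated_sdiv_masked_load in x86-predication.ll: with AVX the
// conditional load of b[i] becomes a masked vector load, so its pointer
// operand stays uniform and the load itself must not be scalarized.
int32_t predicated_sdiv_masked_load(int32_t *a, int32_t *b, int32_t x,
                                    bool c) {
  int32_t r = 0;
  for (int64_t i = 0; i < 10000; ++i) {
    int32_t t = a[i];
    if (c)
      t += b[i] / x; // sdiv fed by the conditionally loaded value
    r += t;
  }
  return r;
}
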
Index: llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -207,3 +207,57 @@
   %exitcond = icmp eq i64 %indvars.iv.next, 128
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
+
+
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+; CHECK: vector.body:
+; CHECK: %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4
+; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK: %[[T00:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK: %[[T01:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK: %[[T02:.+]] = add nsw i32 %[[T01]], %x
+; CHECK: %[[T03:.+]] = udiv i32 %[[T00]], %[[T02]]
+; CHECK: %[[T04:.+]] = insertelement <2 x i32> undef, i32 %[[T03]], i32 0
+; CHECK: br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK: %[[T05:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
+; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK: %[[T06:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK: %[[T07:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK: %[[T08:.+]] = add nsw i32 %[[T07]], %x
+; CHECK: %[[T09:.+]] = udiv i32 %[[T06]], %[[T08]]
+; CHECK: %[[T10:.+]] = insertelement <2 x i32> %[[T05]], i32 %[[T09]], i32 1
+; CHECK: br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK: phi <2 x i32> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp2 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp3 = add nsw i32 %tmp2, %x
+  %tmp4 = udiv i32 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}
Index: llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -12,7 +12,6 @@
 ; VEC-LABEL: test
 ; VEC:   %[[v0:.+]] = add i64 %index, 0
 ; VEC:   %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}},
-; VEC:   %[[v9:.+]] = add nsw <2 x i32> %{{.*}},
 ; VEC:   %[[v10:.+]] = and <2 x i1> %[[v8]],
 ; VEC:   %[[o1:.+]] = or <2 x i1> zeroinitializer, %[[v10]]
 ; VEC:   %[[v11:.+]] = extractelement <2 x i1> %[[o1]], i32 0
@@ -20,9 +19,10 @@
 ; VEC:   br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]]
 ;
 ; VEC: [[cond]]:
-; VEC:   %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
+; VEC:   %[[v13:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; VEC:   %[[v9a:.+]] = add nsw i32 %[[v13]], 20
 ; VEC:   %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
-; VEC:   store i32 %[[v13]], i32* %[[v2]], align 4
+; VEC:   store i32 %[[v9a]], i32* %[[v2]], align 4
 ; VEC:   br label %[[else:.+]]
 ;
 ; VEC: [[else]]:
@@ -31,10 +31,11 @@
 ; VEC:   br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]]
 ;
 ; VEC: [[cond2]]:
-; VEC:   %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
+; VEC:   %[[v17:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; VEC:   %[[v9b:.+]] = add nsw i32 %[[v17]], 20
 ; VEC:   %[[v1:.+]] = add i64 %index, 1
 ; VEC:   %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]]
-; VEC:   store i32 %[[v17]], i32* %[[v4]], align 4
+; VEC:   store i32 %[[v9b]], i32* %[[v4]], align 4
 ; VEC:   br label %[[else2:.+]]
 ;
 ; VEC: [[else2]]:
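
Taken together, the decision the patch makes for each predicated chain reduces to the Discount accumulation in computePredInstDiscount. A minimal model of that final step (not LLVM code; the costs are hypothetical):

#include <cstdio>
#include <vector>

int main() {
  struct ChainCosts {
    unsigned Vector; // cost if the instruction stays vectorized
    unsigned Scalar; // cost if it is scalarized into the predicated block
  };
  // For example, a predicated udiv and the single-use add feeding it.
  std::vector<ChainCosts> Chain = {{5, 4}, {2, 2}};

  int Discount = 0;
  for (const ChainCosts &C : Chain)
    Discount += static_cast<int>(C.Vector) - static_cast<int>(C.Scalar);

  // A non-negative total means the vector form costs at least as much, so the
  // whole chain is scalarized.
  printf("discount = %d -> %s\n", Discount,
         Discount >= 0 ? "scalarize chain" : "keep vector form");
  return 0;
}
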