Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1425,9 +1425,17 @@ /// Returns true if the value V is uniform within the loop. bool isUniform(Value *V); - /// Returns true if this instruction will remain scalar after vectorization. + /// Returns true if \p I is known to be uniform after vectorization. bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); } + /// Returns true if \p I is known to be scalar after vectorization. + bool isScalarAfterVectorization(Instruction *I) { + if (Uniforms.count(I) || !TheLoop->contains(I)) + return true; + auto *GEP = dyn_cast(I); + return GEP && !VectorGEPs.count(GEP); + } + /// Returns the information that we collected about runtime memory check. const RuntimePointerChecking *getRuntimePointerChecking() const { return LAI->getRuntimePointerChecking(); @@ -1480,6 +1488,17 @@ bool isLegalMaskedGather(Type *DataType) { return TTI->isLegalMaskedGather(DataType); } + /// Returns true if the target machine can represent \p V as a masked gather + /// or scatter operation. + bool isLegalGatherOrScatter(Value *V) { + auto *LI = dyn_cast(V); + auto *SI = dyn_cast(V); + if (!LI && !SI) + return false; + auto *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + auto *Ty = cast(Ptr->getType())->getElementType(); + return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); + } /// Returns true if vector representation of the instruction \p I /// requires mask. @@ -1504,9 +1523,26 @@ /// transformation. bool canVectorizeWithIfConvert(); - /// Collect the variables that need to stay uniform after vectorization. + /// Collect the instructions that are uniform after vectorization. An + /// instruction is uniform if we represent it with a single scalar value in + /// the vectorized loop corresponding to each vector iteration. Examples of + /// uniform instructions include scalar induction variables and pointer + /// operands of consecutive or interleaved memory accesses. Note that + /// although uniformity implies an instruction will be scalar, the reverse is + /// not true. In general, a scalarized instruction will be represented by VF + /// scalar values in the vectorized loop, each corresponding to an iteration + /// of the original scalar loop. void collectLoopUniforms(); + /// Collect the getelementptr instructions that will be vectorized. A + /// getelementptr instruction is only vectorized if it is used for a legal + /// gather or scatter operation. + void collectVectorGEPs(); + + /// Returns the uniform pointer for \p I if \p I is uniform or has a uniform + /// pointer operand. + Instruction *getUniformPtr(Instruction *I); + /// Return true if all of the instructions in the block can be speculatively /// executed. \p SafePtrs is a list of addresses that are known to be legal /// and we know that we can read from them without segfault. @@ -1587,6 +1623,9 @@ /// vectorization. SmallPtrSet Uniforms; + /// Holds the getelementptr instructions that will be vectorized. + SmallPtrSet VectorGEPs; + /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; @@ -4539,9 +4578,6 @@ return false; } - // Collect all of the variables that remain uniform after vectorization. - collectLoopUniforms(); - DEBUG(dbgs() << "LV: We can vectorize this loop" << (LAI->getRuntimePointerChecking()->Need ? " (with a runtime bound check)" @@ -4558,6 +4594,12 @@ if (UseInterleaved) InterleaveInfo.analyzeInterleaving(*getSymbolicStrides()); + // Collect all getelementptr instructions that will be vectorized. + collectVectorGEPs(); + + // Collect all instructions that are known to be uniform after vectorization. + collectLoopUniforms(); + unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; @@ -4823,6 +4865,45 @@ return true; } +/// \brief Return the pointer operand of a load or store instruction. +static Value *getPointerOperand(Value *I) { + if (auto *LI = dyn_cast(I)) + return LI->getPointerOperand(); + if (auto *SI = dyn_cast(I)) + return SI->getPointerOperand(); + return nullptr; +} + +Instruction *LoopVectorizationLegality::getUniformPtr(Instruction *I) { + + // If the instruction is a consecutive pointer it will be uniform after + // vectorization. In this case, return the instruction. + if (isa(I->getType()) && isConsecutivePtr(I)) + return I; + + // If the instruction is an interleaved access, the pointer operand will be + // treated as if it were a consecutive pointer. + if (isAccessInterleaved(I)) + return cast(getPointerOperand(I)); + + // Otherwise, assume the instruction has no uniform pointer. + return nullptr; +} + +void LoopVectorizationLegality::collectVectorGEPs() { + for (auto *BB : TheLoop->blocks()) + for (auto &I : *BB) { + auto *Ptr = getPointerOperand(&I); + if (!Ptr) + continue; + auto *GEP = getGEPInstruction(Ptr); + if (GEP && isLegalGatherOrScatter(&I)) { + VectorGEPs.insert(GEP); + DEBUG(dbgs() << "LV: GEP will be vectorized: " << I << "\n"); + } + } +} + void LoopVectorizationLegality::collectLoopUniforms() { // We now know that the loop is vectorizable! // Collect variables that will remain uniform after vectorization. @@ -4843,16 +4924,14 @@ DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); } - // Also add all consecutive pointer values; these values will be uniform - // after vectorization (and subsequent cleanup). - for (auto *BB : TheLoop->blocks()) { + // Also add all uniform pointer values. + for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { - if (I.getType()->isPointerTy() && isConsecutivePtr(&I)) { - Worklist.insert(&I); - DEBUG(dbgs() << "LV: Found uniform instruction: " << I << "\n"); + if (auto *Ptr = getUniformPtr(&I)) { + Worklist.insert(Ptr); + DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ptr << "\n"); } } - } // Expand Worklist in topological order: whenever a new instruction // is added , its users should be either already inside Worklist, or @@ -4875,6 +4954,8 @@ } } while (idx != Worklist.size()); + Uniforms.insert(Worklist.begin(), Worklist.end()); + // For an instruction to be added into Worklist above, all its users inside // the current loop should be already added into Worklist. This condition // cannot be true for phi instructions which is always in a dependence loop. @@ -4887,21 +4968,19 @@ auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); if (all_of(PN->users(), [&](User *U) -> bool { - return U == UpdateV || isOutOfScope(U) || - Worklist.count(cast(U)); + auto *I = dyn_cast(U); + return I && (I == UpdateV || isScalarAfterVectorization(I)); }) && all_of(UpdateV->users(), [&](User *U) -> bool { - return U == PN || isOutOfScope(U) || - Worklist.count(cast(U)); + auto *I = dyn_cast(U); + return I && (I == PN || isScalarAfterVectorization(I)); })) { - Worklist.insert(cast(PN)); - Worklist.insert(cast(UpdateV)); + Uniforms.insert(cast(PN)); + Uniforms.insert(cast(UpdateV)); DEBUG(dbgs() << "LV: Found uniform instruction: " << *PN << "\n"); DEBUG(dbgs() << "LV: Found uniform instruction: " << *UpdateV << "\n"); } } - - Uniforms.insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationLegality::canVectorizeMemory() { @@ -5821,19 +5900,6 @@ return Cost; } -/// \brief Check if the load/store instruction \p I may be translated into -/// gather/scatter during vectorization. -/// -/// Pointer \p Ptr specifies address in memory for the given scalar memory -/// instruction. We need it to retrieve data type. -/// Using gather/scatter is possible when it is supported by target. -static bool isGatherOrScatterLegal(Instruction *I, Value *Ptr, - LoopVectorizationLegality *Legal) { - auto *DataTy = cast(Ptr->getType())->getElementType(); - return (isa(I) && Legal->isLegalMaskedGather(DataTy)) || - (isa(I) && Legal->isLegalMaskedScatter(DataTy)); -} - /// \brief Check whether the address computation for a non-consecutive memory /// access looks like an unlikely candidate for being merged into the indexing /// mode. @@ -6081,7 +6147,7 @@ // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool UseGatherOrScatter = - (ConsecutiveStride == 0) && isGatherOrScatterLegal(I, Ptr, Legal); + (ConsecutiveStride == 0) && Legal->isLegalGatherOrScatter(I); bool Reverse = ConsecutiveStride < 0; const DataLayout &DL = I->getModule()->getDataLayout(); @@ -6244,16 +6310,6 @@ return false; } -/// Take the pointer operand from the Load/Store instruction. -/// Returns NULL if this is not a valid Load/Store instruction. -static Value *getPointerOperand(Value *I) { - if (LoadInst *LI = dyn_cast(I)) - return LI->getPointerOperand(); - if (StoreInst *SI = dyn_cast(I)) - return SI->getPointerOperand(); - return nullptr; -} - void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); @@ -6267,43 +6323,10 @@ } // Insert uniform instruction into VecValuesToIgnore. - // Collect non-gather/scatter and non-consecutive ptr in NonConsecutivePtr. - SmallPtrSet NonConsecutivePtr; - for (auto *BB : TheLoop->getBlocks()) { - for (auto &I : *BB) { + for (auto *BB : TheLoop->getBlocks()) + for (auto &I : *BB) if (Legal->isUniformAfterVectorization(&I)) VecValuesToIgnore.insert(&I); - Instruction *PI = dyn_cast_or_null(getPointerOperand(&I)); - if (PI && !Legal->isConsecutivePtr(PI) && - !isGatherOrScatterLegal(&I, PI, Legal)) - NonConsecutivePtr.insert(PI); - } - } - - // Ignore induction phis that are either used in uniform instructions or - // NonConsecutivePtr. - for (auto &Induction : *Legal->getInductionVars()) { - auto *PN = Induction.first; - auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); - - if (std::all_of(PN->user_begin(), PN->user_end(), - [&](User *U) -> bool { - Instruction *UI = dyn_cast(U); - return U == UpdateV || !TheLoop->contains(UI) || - Legal->isUniformAfterVectorization(UI) || - NonConsecutivePtr.count(UI); - }) && - std::all_of(UpdateV->user_begin(), UpdateV->user_end(), - [&](User *U) -> bool { - Instruction *UI = dyn_cast(U); - return U == PN || !TheLoop->contains(UI) || - Legal->isUniformAfterVectorization(UI) || - NonConsecutivePtr.count(UI); - })) { - VecValuesToIgnore.insert(PN); - VecValuesToIgnore.insert(UpdateV); - } - } } void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, Index: test/Transforms/LoopVectorize/induction.ll =================================================================== --- test/Transforms/LoopVectorize/induction.ll +++ test/Transforms/LoopVectorize/induction.ll @@ -248,6 +248,51 @@ ret void } +; Make sure we scalarize the step vectors used for the pointer arithmetic. We +; can't easily simplify vectorized step vectors. (Interleaved accesses.) +; +; for (int i = 0; i < n; ++i) +; p[i].f = a[i * 4] +; +; INTERLEAVE-LABEL: @scalarize_induction_variable_04( +; INTERLEAVE: vector.body: +; INTERLEAVE: %[[i0:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; INTERLEAVE: %[[i1:.+]] = or i64 %[[i0]], 1 +; INTERLEAVE: %[[i2:.+]] = or i64 %[[i0]], 2 +; INTERLEAVE: %[[i3:.+]] = or i64 %[[i0]], 3 +; INTERLEAVE: %[[i4:.+]] = or i64 %[[i0]], 4 +; INTERLEAVE: %[[i5:.+]] = or i64 %[[i0]], 5 +; INTERLEAVE: %[[i6:.+]] = or i64 %[[i0]], 6 +; INTERLEAVE: %[[i7:.+]] = or i64 %[[i0]], 7 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i0]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i1]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i2]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i3]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i4]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i5]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i6]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i7]], i32 1 + +define void @scalarize_induction_variable_04(i32* %a, %pair* %p, i32 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry] + %0 = shl nsw i64 %i, 2 + %1 = getelementptr inbounds i32, i32* %a, i64 %0 + %2 = load i32, i32* %1, align 1 + %3 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1 + store i32 %2, i32* %3, align 1 + %i.next = add nuw nsw i64 %i, 1 + %4 = trunc i64 %i.next to i32 + %cond = icmp eq i32 %4, %n + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + ; Make sure that the loop exit count computation does not overflow for i8 and ; i16. The exit count of these loops is i8/i16 max + 1. If we don't cast the ; induction variable to a bigger type the exit count computation will overflow