Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -324,8 +324,9 @@ : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Induction(nullptr), - OldInduction(nullptr), WidenMap(UnrollFactor), TripCount(nullptr), - VectorTripCount(nullptr), Legal(nullptr), AddedSafetyChecks(false) {} + OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth), + TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), + AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). // MinimumBitWidths maps scalar integer values to the smallest bitwidth they @@ -352,11 +353,18 @@ protected: /// A small list of PHINodes. typedef SmallVector PhiVector; - /// When we unroll loops we have multiple vector values for each scalar. - /// This data structure holds the unrolled and vectorized values that - /// originated from one scalar instruction. + + /// A type for vectorized values in the new loop. Each value from the + /// original loop, when vectorized, is represented by UF vector values in the + /// new unrolled loop, where UF is the unroll factor. typedef SmallVector VectorParts; + /// A type for scalarized values in the new loop. Each value from the + /// original loop, when scalarized, is represented by UF x VF scalar values + /// in the new unrolled loop, where UF is the unroll factor and VF is the + /// vectorization factor. + typedef SmallVector, 2> ScalarParts; + // When we if-convert we need to create edge masks. We have to cache values // so that we don't end up with exponential recursion/IR. typedef DenseMap, VectorParts> @@ -461,12 +469,21 @@ /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// When we go over instructions in the basic block we rely on previous - /// values within the current basic block or on loop invariant values. - /// When we widen (vectorize) values we place them in the map. If the values - /// are not within the map, they have to be loop invariant, so we simply - /// broadcast them into a vector. - VectorParts &getVectorValue(Value *V); + /// Return a constant reference to the VectorParts corresponding to \p V from + /// the original loop. If the value has already been vectorized, the + /// corresponding vector entry in VectorLoopValueMap is returned. If, + /// however, the value has a scalar entry in VectorLoopValueMap, we construct + /// new vector values on-demand by inserting the scalar values into vectors + /// with an insertelement sequence. If the value has been neither vectorized + /// nor scalarized, it must be loop invariant, so we simply broadcast the + /// value into vectors. + const VectorParts &getVectorValue(Value *V); + + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part and vector index \p Lane. If the value has + /// been vectorized but not scalarized, the necessary extractelement + /// instruction will be generated. + Value *getScalarValue(Value *V, unsigned Part, unsigned Lane); /// Try to vectorize the interleaved access group that \p Instr belongs to. void vectorizeInterleaveGroup(Instruction *Instr); @@ -509,44 +526,112 @@ /// vector of instructions. 
void addMetadata(ArrayRef To, Instruction *From); - /// This is a helper class that holds the vectorizer state. It maps scalar - /// instructions to vector instructions. When the code is 'unrolled' then - /// then a single scalar value is mapped to multiple vector parts. The parts - /// are stored in the VectorPart type. + /// This is a helper class for maintaining vectorization state. It's used for + /// mapping values from the original loop to their corresponding values in + /// the new loop. Two mappings are maintained: one for vectorized values and + /// one for scalarized values. Vectorized values are represented with UF + /// vector values in the new loop, and scalarized values are represented with + /// UF x VF scalar values in the new loop. UF and VF are the unroll and + /// vectorization factors, respectively. + /// + /// Entries can be added to either map with initVector and initScalar, which + /// initialize and return a constant reference to the new entry. If a + /// non-constant reference to a vector entry is required, getVector can be + /// used to retrieve a mutable entry. We currently directly modify the mapped + /// values during "fix-up" operations that occur once the first phase of + /// widening is complete. These operations include type truncation and the + /// second phase of recurrence widening. + /// + /// Otherwise, entries from either map should be accessed using the + /// getVectorValue or getScalarValue functions from InnerLoopVectorizer. + /// getVectorValue and getScalarValue coordinate to generate a vector or + /// scalar value on-demand if one is not yet available. When vectorizing a + /// loop, we visit the definition of an instruction before its uses. When + /// visiting the definition, we either vectorize or scalarize the + /// instruction, creating an entry for it in the corresponding map. (In some + /// cases, such as induction variables, we will create both vector and scalar + /// entries.) Then, as we encounter uses of the definition, we derive values + /// for each scalar or vector use unless such a value is already available. + /// For example, if we scalarize a definition and one of its uses is vector, + /// we build the required vector on-demand with an insertelement sequence + /// when visiting the use. Otherwise, if the use is scalar, we can use the + /// existing scalar definition. struct ValueMap { - /// C'tor. UnrollFactor controls the number of vectors ('parts') that - /// are mapped. - ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} - - /// \return True if 'Key' is saved in the Value Map. - bool has(Value *Key) const { return MapStorage.count(Key); } - - /// Initializes a new entry in the map. Sets all of the vector parts to the - /// save value in 'Val'. - /// \return A reference to a vector with splat values. - VectorParts &splat(Value *Key, Value *Val) { - VectorParts &Entry = MapStorage[Key]; - Entry.assign(UF, Val); - return Entry; - } - - ///\return A reference to the value that is stored at 'Key'. - VectorParts &get(Value *Key) { - VectorParts &Entry = MapStorage[Key]; - if (Entry.empty()) - Entry.resize(UF); - assert(Entry.size() == UF); - return Entry; - } + + /// Construct an empty map with the given unroll and vectorization factors. + ValueMap(unsigned UnrollFactor, unsigned VecWidth) + : UF(UnrollFactor), VF(VecWidth) { + // The unroll and vectorization factors are only used in asserts builds + // to verify map entries are sized appropriately. 
+ (void)UF; + (void)VF; + } + + /// \return True if the map has a vector entry for \p Key. + bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); } + + /// \return True if the map has a scalar entry for \p Key. + bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); } + + /// \brief Map \p Key to the given VectorParts \p Entry, and return a + /// constant reference to the new vector map entry. The given key should + /// not already be in the map, and the given VectorParts should be + /// correctly sized for the current unroll factor. + const VectorParts &initVector(Value *Key, const VectorParts &Entry) { + assert(!hasVector(Key) && "Vector entry already initialized"); + assert(Entry.size() == UF && "VectorParts has wrong dimensions"); + VectorMapStorage[Key] = Entry; + return VectorMapStorage[Key]; + } + + /// \brief Map \p Key to the given ScalarParts \p Entry, and return a + /// constant reference to the new scalar map entry. The given key should + /// not already be in the map, and the given ScalarParts should be + /// correctly sized for the current unroll and vectorization factors. + const ScalarParts &initScalar(Value *Key, const ScalarParts &Entry) { + assert(!hasScalar(Key) && "Scalar entry already initialized"); + assert(Entry.size() == UF && + all_of(make_range(Entry.begin(), Entry.end()), + [&](const SmallVectorImpl &Values) -> bool { + return Values.size() == VF; + }) && + "ScalarParts has wrong dimensions"); + ScalarMapStorage[Key] = Entry; + return ScalarMapStorage[Key]; + } + + /// \return A reference to the vector map entry corresponding to \p Key. + /// The key should already be in the map. This function should only be used + /// when it's necessary to update values that have already been vectorized. + /// This is the case for "fix-up" operations including type truncation and + /// the second phase of recurrence vectorization. If a non-const reference + /// isn't required, getVectorValue should be used instead. + VectorParts &getVector(Value *Key) { + assert(hasVector(Key) && "Vector entry not initialized"); + return VectorMapStorage.find(Key)->second; + } + + /// Retrieve an entry from the vector or scalar maps. The preferred way to + /// access an existing mapped entry is with getVectorValue or + /// getScalarValue from InnerLoopVectorizer. Until those functions can be + /// moved inside ValueMap, we have to declare them as friends. + friend const VectorParts &InnerLoopVectorizer::getVectorValue(Value *V); + friend Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, + unsigned Lane); private: - /// The unroll factor. Each entry in the map stores this number of vector - /// elements. + /// The unroll factor. Each entry in the vector map contains UF vector + /// values. unsigned UF; - /// Map storage. We use std::map and not DenseMap because insertions to a - /// dense map invalidates its iterators. - std::map MapStorage; + /// The vectorization factor. Each entry in the scalar map contains UF x VF + /// scalar values. + unsigned VF; + + /// The vector and scalar map storage. We use std::map and not DenseMap + /// because insertions to DenseMap invalidate its iterators. + std::map VectorMapStorage; + std::map ScalarMapStorage; }; /// The original loop. @@ -610,18 +695,12 @@ PHINode *Induction; /// The induction variable of the old basic block. PHINode *OldInduction; - /// Maps scalars to widened vectors. 
- ValueMap WidenMap; - /// A map of induction variables from the original loop to their - /// corresponding VF * UF scalarized values in the vectorized loop. The - /// purpose of ScalarIVMap is similar to that of WidenMap. Whereas WidenMap - /// maps original loop values to their vector versions in the new loop, - /// ScalarIVMap maps induction variables from the original loop that are not - /// vectorized to their scalar equivalents in the vector loop. Maintaining a - /// separate map for scalarized induction variables allows us to avoid - /// unnecessary scalar-to-vector-to-scalar conversions. - DenseMap> ScalarIVMap; + /// Maps values from the original loop to their corresponding values in the + /// vectorized loop. A key value can map to either vector values, scalar + /// values or both kinds of values, depending on whether the key was + /// vectorized and scalarized. + ValueMap VectorLoopValueMap; /// Store instructions that should be predicated, as a pair /// @@ -2165,14 +2244,18 @@ assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() && "Val and Step should have the same integer type"); - // Compute the scalar steps and save the results in ScalarIVMap. - for (unsigned Part = 0; Part < UF; ++Part) - for (unsigned I = 0; I < VF; ++I) { - auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + I); + // Compute the scalar steps and save the results in VectorLoopValueMap. + ScalarParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part].resize(VF); + for (unsigned Lane = 0; Lane < VF; ++Lane) { + auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane); auto *Mul = Builder.CreateMul(StartIdx, Step); auto *Add = Builder.CreateAdd(ScalarIV, Mul); - ScalarIVMap[EntryVal].push_back(Add); + Entry[Part][Lane] = Add; } + } + VectorLoopValueMap.initScalar(EntryVal, Entry); } int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { @@ -2264,23 +2347,83 @@ return LAI->isUniform(V); } -InnerLoopVectorizer::VectorParts & +const InnerLoopVectorizer::VectorParts & InnerLoopVectorizer::getVectorValue(Value *V) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); + assert(!V->getType()->isVoidTy() && "Type does not produce a value"); // If we have a stride that is replaced by one, do it here. if (Legal->hasStride(V)) V = ConstantInt::get(V->getType(), 1); // If we have this scalar in the map, return it. - if (WidenMap.has(V)) - return WidenMap.get(V); + if (VectorLoopValueMap.hasVector(V)) + return VectorLoopValueMap.VectorMapStorage[V]; + + // If the value has not been vectorized, check if it has been scalarized + // instead. If it has been scalarized, and we actually need the value in + // vector form, we will construct the vector values on demand. + if (VectorLoopValueMap.hasScalar(V)) { + + // Initialize a new vector map entry. + VectorParts Entry(UF); + + // If we aren't vectorizing, we can just copy the scalar map values over to + // the vector map. + if (VF == 1) { + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = getScalarValue(V, Part, 0); + return VectorLoopValueMap.initVector(V, Entry); + } + + // However, if we are vectorizing, we need to construct the vector values + // using insertelement instructions. Since the resulting vectors are stored + // in VectorLoopValueMap, we will only generate the insertelements once. 
+ for (unsigned Part = 0; Part < UF; ++Part) { + Value *Insert = UndefValue::get(VectorType::get(V->getType(), VF)); + for (unsigned Width = 0; Width < VF; ++Width) + Insert = Builder.CreateInsertElement( + Insert, getScalarValue(V, Part, Width), Builder.getInt32(Width)); + Entry[Part] = Insert; + } + return VectorLoopValueMap.initVector(V, Entry); + } // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - return WidenMap.splat(V, B); + return VectorLoopValueMap.initVector(V, VectorParts(UF, B)); +} + +Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, + unsigned Lane) { + + // If the value is not an instruction contained in the loop, it should + // already be scalar. + if (OrigLoop->isLoopInvariant(V)) + return V; + + // If the value from the original loop has not been vectorized, it is + // represented by UF x VF scalar values in the new loop. Return the requested + // scalar value. + if (VectorLoopValueMap.hasScalar(V)) + return VectorLoopValueMap.ScalarMapStorage[V][Part][Lane]; + + // If the value has not been scalarized, get its entry in VectorLoopValueMap + // for the given unroll part. If this entry is not a vector type (i.e., the + // vectorization factor is one), there is no need to generate an + // extractelement instruction. + auto *U = getVectorValue(V)[Part]; + if (!U->getType()->isVectorTy()) { + assert(VF == 1 && "Value not scalarized has non-vector type"); + return U; + } + + // Otherwise, the value from the original loop has been vectorized and is + // represented by UF vector values. Extract and return the requested scalar + // value from the appropriate vector lane. + return Builder.CreateExtractElement(U, Builder.getInt32(Lane)); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -2438,15 +2581,10 @@ // Prepare for the new pointers. setDebugLocFromInst(Builder, Ptr); - VectorParts &PtrParts = getVectorValue(Ptr); SmallVector NewPtrs; unsigned Index = Group->getIndex(Instr); for (unsigned Part = 0; Part < UF; Part++) { - // Extract the pointer for current instruction from the pointer vector. A - // reverse access uses the pointer in the last lane. - Value *NewPtr = Builder.CreateExtractElement( - PtrParts[Part], - Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0)); + Value *NewPtr = getScalarValue(Ptr, Part, Group->isReverse() ? VF - 1 : 0); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2470,20 +2608,30 @@ // Vectorize the interleaved load group. if (LI) { + + // For each unroll part, create a wide load for the group. + SmallVector NewLoads; for (unsigned Part = 0; Part < UF; Part++) { - Instruction *NewLoadInstr = Builder.CreateAlignedLoad( + auto *NewLoad = Builder.CreateAlignedLoad( NewPtrs[Part], Group->getAlignment(), "wide.vec"); + addMetadata(NewLoad, Instr); + NewLoads.push_back(NewLoad); + } - for (unsigned i = 0; i < InterleaveFactor; i++) { - Instruction *Member = Group->getMember(i); + // For each member in the group, shuffle out the appropriate data from the + // wide loads. + for (unsigned I = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); - // Skip the gaps in the group. - if (!Member) - continue; + // Skip the gaps in the group. 
+ if (!Member) + continue; - Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF); + VectorParts Entry(UF); + Constant *StrideMask = getStridedMask(Builder, I, InterleaveFactor, VF); + for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( - NewLoadInstr, UndefVec, StrideMask, "strided.vec"); + NewLoads[Part], UndefVec, StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { @@ -2491,12 +2639,10 @@ StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); } - VectorParts &Entry = WidenMap.get(Member); Entry[Part] = Group->isReverse() ? reverseVector(StridedVec) : StridedVec; } - - addMetadata(NewLoadInstr, Instr); + VectorLoopValueMap.initVector(Member, Entry); } return; } @@ -2585,8 +2731,6 @@ if (!ConsecutiveStride && !CreateGatherScatter) return scalarizeInstruction(Instr); - Constant *Zero = Builder.getInt32(0); - VectorParts &Entry = WidenMap.get(Instr); VectorParts VectorGep; // Handle consecutive loads/stores. @@ -2594,9 +2738,7 @@ if (ConsecutiveStride) { if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { setDebugLocFromInst(Builder, Gep); - Value *PtrOperand = Gep->getPointerOperand(); - Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; - FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); + auto *FirstBasePtr = getScalarValue(Gep->getPointerOperand(), 0, 0); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast(Gep->clone()); @@ -2627,16 +2769,7 @@ OrigLoop)) && "Must be last index or loop invariant"); - VectorParts &GEPParts = getVectorValue(GepOperand); - - // If GepOperand is an induction variable, and there's a scalarized - // version of it available, use it. Otherwise, we will need to create - // an extractelement instruction. - Value *Index = ScalarIVMap.count(GepOperand) - ? ScalarIVMap[GepOperand][0] - : Builder.CreateExtractElement(GEPParts[0], Zero); - - Gep2->setOperand(i, Index); + Gep2->setOperand(i, getScalarValue(GepOperand, 0, 0)); Gep2->setName("gep.indvar.idx"); } } @@ -2645,8 +2778,7 @@ // Use the induction element ptr. assert(isa(Ptr) && "Invalid induction ptr"); setDebugLocFromInst(Builder, Ptr); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); + Ptr = getScalarValue(Ptr, 0, 0); } } else { // At this point we should vector version of GEP for Gather or Scatter @@ -2733,6 +2865,7 @@ // Handle loads. assert(LI && "Must have a load instruction"); setDebugLocFromInst(Builder, LI); + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Instruction *NewLI; if (CreateGatherScatter) { @@ -2765,6 +2898,7 @@ } addMetadata(NewLI, LI); } + VectorLoopValueMap.initVector(Instr, Entry); } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, @@ -2778,42 +2912,11 @@ setDebugLocFromInst(Builder, Instr); - // Find all of the vectorized parameters. - for (Value *SrcOp : Instr->operands()) { - // If we are accessing the old induction variable, use the new one. - if (SrcOp == OldInduction) { - Params.push_back(getVectorValue(SrcOp)); - continue; - } - - // Try using previously calculated values. - auto *SrcInst = dyn_cast(SrcOp); - - // If the src is an instruction that appeared earlier in the basic block, - // then it should already be vectorized. 
- if (SrcInst && OrigLoop->contains(SrcInst)) { - assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); - // The parameter is a vector value from earlier. - Params.push_back(WidenMap.get(SrcInst)); - } else { - // The parameter is a scalar from outside the loop. Maybe even a constant. - VectorParts Scalars; - Scalars.append(UF, SrcOp); - Params.push_back(Scalars); - } - } - - assert(Params.size() == Instr->getNumOperands() && - "Invalid number of operands"); - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = - IsVoidRetTy ? nullptr - : UndefValue::get(VectorType::get(Instr->getType(), VF)); - // Create a new entry in the WidenMap and initialize it to Undef or Null. - VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + // Initialize a new scalar map entry. + ScalarParts Entry(UF); VectorParts Cond; if (IfPredicateInstr) { @@ -2825,6 +2928,7 @@ // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part].resize(VF); // For each scalar that we create: for (unsigned Width = 0; Width < VF; ++Width) { @@ -2839,18 +2943,11 @@ Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instructions with extracted scalars. - for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - // If the operand is an induction variable, and there's a scalarized - // version of it available, use it. Otherwise, we will need to create - // an extractelement instruction if vectorizing. - auto *NewOp = Params[op][Part]; - auto *ScalarOp = Instr->getOperand(op); - if (ScalarIVMap.count(ScalarOp)) - NewOp = ScalarIVMap[ScalarOp][VF * Part + Width]; - else if (NewOp->getType()->isVectorTy()) - NewOp = Builder.CreateExtractElement(NewOp, Builder.getInt32(Width)); + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Width); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -2858,21 +2955,20 @@ // Place the cloned scalar in the new loop. Builder.Insert(Cloned); + // Add the cloned scalar to the scalar map entry. + Entry[Part][Width] = Cloned; + // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast(Cloned)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); - // If the original scalar returns a value we need to place it in a vector - // so that future users will be able to use it. - if (!IsVoidRetTy) - VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, - Builder.getInt32(Width)); // End if-block. if (IfPredicateInstr) PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } } + VectorLoopValueMap.initScalar(Instr, Entry); } PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, @@ -3556,7 +3652,7 @@ // SmallPtrSet Erased; for (const auto &KV : *MinBWs) { - VectorParts &Parts = WidenMap.get(KV.first); + VectorParts &Parts = VectorLoopValueMap.getVector(KV.first); for (Value *&I : Parts) { if (Erased.count(I) || I->use_empty() || !isa(I)) continue; @@ -3648,7 +3744,7 @@ // We'll have created a bunch of ZExts that are now parentless. Clean up. 
for (const auto &KV : *MinBWs) { - VectorParts &Parts = WidenMap.get(KV.first); + VectorParts &Parts = VectorLoopValueMap.getVector(KV.first); for (Value *&I : Parts) { ZExtInst *Inst = dyn_cast(I); if (Inst && Inst->use_empty()) { @@ -3726,7 +3822,7 @@ Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); // This is the vector-clone of the value that leaves the loop. - VectorParts &VectorExit = getVectorValue(LoopExitInst); + const VectorParts &VectorExit = getVectorValue(LoopExitInst); Type *VecTy = VectorExit[0]->getType(); // Find the reduction identity variable. Zero for addition, or, xor, @@ -3765,10 +3861,10 @@ // Reductions do not have to start at zero. They can start with // any loop invariant values. - VectorParts &VecRdxPhi = WidenMap.get(Phi); + const VectorParts &VecRdxPhi = getVectorValue(Phi); BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = Phi->getIncomingValueForBlock(Latch); - VectorParts &Val = getVectorValue(LoopVal); + const VectorParts &Val = getVectorValue(LoopVal); for (unsigned part = 0; part < UF; ++part) { // Make sure to add the reduction stat value only to the // first unroll part. @@ -3785,7 +3881,7 @@ // instructions. Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - VectorParts RdxParts = getVectorValue(LoopExitInst); + VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst); setDebugLocFromInst(Builder, LoopExitInst); // If the vector reduction can be performed in a smaller type, we truncate @@ -3994,7 +4090,7 @@ // We constructed a temporary phi node in the first phase of vectorization. // This phi node will eventually be deleted. - auto &PhiParts = getVectorValue(Phi); + VectorParts &PhiParts = VectorLoopValueMap.getVector(Phi); Builder.SetInsertPoint(cast(PhiParts[0])); // Create a phi node for the new recurrence. The current value will either be @@ -4304,7 +4400,7 @@ for (unsigned In = 0; In < NumIncoming; In++) { VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), P->getParent()); - VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); + const VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); for (unsigned part = 0; part < UF; ++part) { // We might have single edge PHIs (blocks) - use an identity @@ -4414,8 +4510,6 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (Instruction &I : *BB) { - VectorParts &Entry = WidenMap.get(&I); - switch (I.getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the @@ -4423,7 +4517,9 @@ continue; case Instruction::PHI: { // Vectorize PHINodes. + VectorParts Entry(UF); widenPHIInstruction(&I, Entry, UF, VF, PV); + VectorLoopValueMap.initVector(&I, Entry); continue; } // End of PHI. @@ -4454,10 +4550,11 @@ // Just widen binops. auto *BinOp = cast(&I); setDebugLocFromInst(Builder, BinOp); - VectorParts &A = getVectorValue(BinOp->getOperand(0)); - VectorParts &B = getVectorValue(BinOp->getOperand(1)); + const VectorParts &A = getVectorValue(BinOp->getOperand(0)); + const VectorParts &B = getVectorValue(BinOp->getOperand(1)); // Use this vector value for all users of the original instruction. + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); @@ -4467,6 +4564,7 @@ Entry[Part] = V; } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, BinOp); break; } @@ -4483,20 +4581,19 @@ // loop. 
This means that we can't just use the original 'cond' value. // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - VectorParts &Cond = getVectorValue(I.getOperand(0)); - VectorParts &Op0 = getVectorValue(I.getOperand(1)); - VectorParts &Op1 = getVectorValue(I.getOperand(2)); - - Value *ScalarCond = - (VF == 1) - ? Cond[0] - : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); + const VectorParts &Cond = getVectorValue(I.getOperand(0)); + const VectorParts &Op0 = getVectorValue(I.getOperand(1)); + const VectorParts &Op1 = getVectorValue(I.getOperand(2)); + + auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = Builder.CreateSelect( InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]); } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4507,8 +4604,9 @@ bool FCmp = (I.getOpcode() == Instruction::FCmp); auto *Cmp = dyn_cast(&I); setDebugLocFromInst(Builder, Cmp); - VectorParts &A = getVectorValue(Cmp->getOperand(0)); - VectorParts &B = getVectorValue(Cmp->getOperand(1)); + const VectorParts &A = getVectorValue(Cmp->getOperand(0)); + const VectorParts &B = getVectorValue(Cmp->getOperand(1)); + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Value *C = nullptr; if (FCmp) { @@ -4520,6 +4618,7 @@ Entry[Part] = C; } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4542,6 +4641,7 @@ case Instruction::BitCast: { auto *CI = dyn_cast(&I); setDebugLocFromInst(Builder, CI); + VectorParts Entry(UF); // Optimize the special case where the source is a constant integer // induction variable. Notice that we can only optimize the 'trunc' case @@ -4551,6 +4651,7 @@ if (isa(CI) && CI->getOperand(0) == OldInduction && ID.getConstIntStepValue()) { widenIntInduction(OldInduction, Entry, cast(CI)); + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4559,9 +4660,10 @@ Type *DestTy = (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); - VectorParts &A = getVectorValue(CI->getOperand(0)); + const VectorParts &A = getVectorValue(CI->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4600,6 +4702,7 @@ break; } + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Args; for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { @@ -4607,7 +4710,7 @@ // Some intrinsics have a scalar argument - don't replace it with a // vector. if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { - VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); + const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); Arg = VectorArg[Part]; } Args.push_back(Arg); @@ -4645,6 +4748,7 @@ Entry[Part] = V; } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -6575,40 +6679,11 @@ setDebugLocFromInst(Builder, Instr); - // Find all of the vectorized parameters. - for (Value *SrcOp : Instr->operands()) { - // If we are accessing the old induction variable, use the new one. - if (SrcOp == OldInduction) { - Params.push_back(getVectorValue(SrcOp)); - continue; - } - - // Try using previously calculated values. 
- Instruction *SrcInst = dyn_cast(SrcOp); - - // If the src is an instruction that appeared earlier in the basic block - // then it should already be vectorized. - if (SrcInst && OrigLoop->contains(SrcInst)) { - assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); - // The parameter is a vector value from earlier. - Params.push_back(WidenMap.get(SrcInst)); - } else { - // The parameter is a scalar from outside the loop. Maybe even a constant. - VectorParts Scalars; - Scalars.append(UF, SrcOp); - Params.push_back(Scalars); - } - } - - assert(Params.size() == Instr->getNumOperands() && - "Invalid number of operands"); - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(Instr->getType()); - // Create a new entry in the WidenMap and initialize it to Undef or Null. - VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + // Initialize a new scalar map entry. + ScalarParts Entry(UF); VectorParts Cond; if (IfPredicateInstr) { @@ -6620,6 +6695,7 @@ // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part].resize(1); // For each scalar that we create: // Start an "if (pred) a[i] = ..." block. @@ -6635,29 +6711,30 @@ Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instructions with extracted scalars. + + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - Value *Op = Params[op][Part]; - Cloned->setOperand(op, Op); + auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0); + Cloned->setOperand(op, NewOp); } // Place the cloned scalar in the new loop. Builder.Insert(Cloned); + // Add the cloned scalar to the scalar map entry. + Entry[Part][0] = Cloned; + // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast(Cloned)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); - // If the original scalar returns a value we need to place it in a vector - // so that future users will be able to use it. - if (!IsVoidRetTy) - VecResults[Part] = Cloned; - // End if-block. 
if (IfPredicateInstr) PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } + VectorLoopValueMap.initScalar(Instr, Entry); } void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { Index: llvm/trunk/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -103,10 +103,10 @@ ; CHECK-LABEL: @ptr_ind_plus2( ; CHECK: %[[V0:.*]] = load <8 x i32> -; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> -; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> ; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> ; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> ; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> ; CHECK: mul nsw <4 x i32> ; CHECK: mul nsw <4 x i32> Index: llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -39,102 +39,70 @@ ; CHECK-NEXT: [[IND30:%.*]] = add i64 %offset.idx, 30 ; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i64> , [[VEC_IND]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND00]] -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND02]] -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND04]] -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND06]] -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND08]] -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND10]] -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND12]] -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND14]] -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7 ; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND16]] -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8 ; CHECK-NEXT: [[TMP39:%.*]] = getelementptr 
inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND18]] -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9 ; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND20]] -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10 ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND22]] -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND24]] -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12 ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND26]] -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13 ; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND28]] -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14 ; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND30]] -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15 ; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]] -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = extractelement <16 x i64> [[TMP59]], i32 0 -; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP60]], i64 [[TMP61]], i64 0 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <16 x i32*> undef, i32* [[TMP62]], i32 0 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 1 +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP12]], i64 [[TMP61]], i64 0 ; CHECK-NEXT: [[TMP65:%.*]] = extractelement <16 x i64> [[TMP59]], i32 1 -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP64]], i64 [[TMP65]], i64 0 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <16 x i32*> [[TMP63]], i32* [[TMP66]], i32 1 -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 2 +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP15]], i64 [[TMP65]], i64 0 ; CHECK-NEXT: [[TMP69:%.*]] = extractelement <16 x i64> [[TMP59]], i32 2 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP68]], i64 [[TMP69]], i64 0 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <16 x i32*> [[TMP67]], i32* [[TMP70]], i32 2 -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 3 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP18]], i64 [[TMP69]], i64 0 ; CHECK-NEXT: [[TMP73:%.*]] = extractelement <16 x i64> [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP72]], i64 [[TMP73]], i64 0 -; CHECK-NEXT: [[TMP75:%.*]] = insertelement <16 x i32*> [[TMP71]], i32* [[TMP74]], i32 3 -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 4 +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP21]], i64 [[TMP73]], i64 0 ; CHECK-NEXT: [[TMP77:%.*]] = 
extractelement <16 x i64> [[TMP59]], i32 4 -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP76]], i64 [[TMP77]], i64 0 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <16 x i32*> [[TMP75]], i32* [[TMP78]], i32 4 -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 5 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP24]], i64 [[TMP77]], i64 0 ; CHECK-NEXT: [[TMP81:%.*]] = extractelement <16 x i64> [[TMP59]], i32 5 -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP80]], i64 [[TMP81]], i64 0 -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <16 x i32*> [[TMP79]], i32* [[TMP82]], i32 5 -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 6 +; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP27]], i64 [[TMP81]], i64 0 ; CHECK-NEXT: [[TMP85:%.*]] = extractelement <16 x i64> [[TMP59]], i32 6 -; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP84]], i64 [[TMP85]], i64 0 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <16 x i32*> [[TMP83]], i32* [[TMP86]], i32 6 -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 7 +; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP30]], i64 [[TMP85]], i64 0 ; CHECK-NEXT: [[TMP89:%.*]] = extractelement <16 x i64> [[TMP59]], i32 7 -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP88]], i64 [[TMP89]], i64 0 -; CHECK-NEXT: [[TMP91:%.*]] = insertelement <16 x i32*> [[TMP87]], i32* [[TMP90]], i32 7 -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 8 +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP33]], i64 [[TMP89]], i64 0 ; CHECK-NEXT: [[TMP93:%.*]] = extractelement <16 x i64> [[TMP59]], i32 8 -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP92]], i64 [[TMP93]], i64 0 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <16 x i32*> [[TMP91]], i32* [[TMP94]], i32 8 -; CHECK-NEXT: [[TMP96:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 9 +; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP36]], i64 [[TMP93]], i64 0 ; CHECK-NEXT: [[TMP97:%.*]] = extractelement <16 x i64> [[TMP59]], i32 9 -; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP96]], i64 [[TMP97]], i64 0 -; CHECK-NEXT: [[TMP99:%.*]] = insertelement <16 x i32*> [[TMP95]], i32* [[TMP98]], i32 9 -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 10 +; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP39]], i64 [[TMP97]], i64 0 ; CHECK-NEXT: [[TMP101:%.*]] = extractelement <16 x i64> [[TMP59]], i32 10 -; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP100]], i64 [[TMP101]], i64 0 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <16 x i32*> [[TMP99]], i32* [[TMP102]], i32 10 -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 11 +; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP42]], i64 [[TMP101]], i64 0 ; CHECK-NEXT: [[TMP105:%.*]] = extractelement <16 x i64> [[TMP59]], i32 11 -; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP104]], i64 [[TMP105]], i64 0 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <16 x i32*> [[TMP103]], i32* [[TMP106]], i32 11 -; CHECK-NEXT: [[TMP108:%.*]] = 
extractelement <16 x [10 x i32]*> [[TMP58]], i32 12 +; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP45]], i64 [[TMP105]], i64 0 ; CHECK-NEXT: [[TMP109:%.*]] = extractelement <16 x i64> [[TMP59]], i32 12 -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP108]], i64 [[TMP109]], i64 0 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <16 x i32*> [[TMP107]], i32* [[TMP110]], i32 12 -; CHECK-NEXT: [[TMP112:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 13 +; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP48]], i64 [[TMP109]], i64 0 ; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i64> [[TMP59]], i32 13 -; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP112]], i64 [[TMP113]], i64 0 -; CHECK-NEXT: [[TMP115:%.*]] = insertelement <16 x i32*> [[TMP111]], i32* [[TMP114]], i32 13 -; CHECK-NEXT: [[TMP116:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 14 +; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP51]], i64 [[TMP113]], i64 0 ; CHECK-NEXT: [[TMP117:%.*]] = extractelement <16 x i64> [[TMP59]], i32 14 -; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP116]], i64 [[TMP117]], i64 0 -; CHECK-NEXT: [[TMP119:%.*]] = insertelement <16 x i32*> [[TMP115]], i32* [[TMP118]], i32 14 -; CHECK-NEXT: [[TMP120:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 15 +; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP54]], i64 [[TMP117]], i64 0 ; CHECK-NEXT: [[TMP121:%.*]] = extractelement <16 x i64> [[TMP59]], i32 15 -; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP120]], i64 [[TMP121]], i64 0 -; CHECK-NEXT: [[TMP123:%.*]] = insertelement <16 x i32*> [[TMP119]], i32* [[TMP122]], i32 15 +; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP57]], i64 [[TMP121]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14 
+; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15 ; CHECK-NEXT: [[VECTORGEP:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP58]], <16 x i64> [[TMP59]], i64 0 ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> [[VECTORGEP]], i32 16, <16 x i1> ) ; CHECK: [[STEP_ADD:%.*]] = add <16 x i64> [[VEC_IND]], Index: llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -115,15 +115,6 @@ br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; Future-use test for predication under smarter scalar-scalar: this test will -; fail when the vectorizer starts feeding scalarized values directly to their -; scalar users, i.e. w/o generating redundant insertelement/extractelement -; instructions. This case is already supported by the predication code (which -; should generate a phi for the scalar predicated value rather than for the -; insertelement), but cannot be tested yet. -; If you got this test to fail, kindly fix the test by using the alternative -; FFU sequence. This will make the test check how we handle this case from -; now on. define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) { entry: br label %for.body @@ -136,18 +127,9 @@ ; CHECK: br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]] ; CHECK: [[THEN]]: ; CHECK: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}} -; CHECK: %[[PDV:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[PD]], i32 0 ; CHECK: br label %[[FI]] ; CHECK: [[FI]]: -; CHECK: %[[PH:[a-zA-Z0-9]+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[PDV]], %[[THEN]] ] -; FFU-LABEL: test_scalar2scalar -; FFU: vector.body: -; FFU: br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]] -; FFU: [[THEN]]: -; FFU: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}} -; FFU: br label %[[FI]] -; FFU: [[FI]]: -; FFU: %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ] +; CHECK: %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ] for.body: ; preds = %if.end, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ] Index: llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll +++ llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -11,6 +11,10 @@ br label %for.body ; VEC-LABEL: test +; VEC: %[[v0:.+]] = add i64 %index, 0 +; VEC: %[[v1:.+]] = add i64 %index, 1 +; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]] +; VEC: %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]] ; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, ; VEC: %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, ; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], @@ -20,8 +24,7 @@ ; ; VEC: [[cond]]: ; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 -; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 -; VEC: store i32 %[[v13]], i32* %[[v14]], align 4 +; VEC: store i32 %[[v13]], i32* %[[v2]], align 4 ; VEC: br label %[[else:.+]] ; ; VEC: [[else]]: @@ -31,8 +34,7 @@ ; ; VEC: [[cond2]]: ; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 -; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 -; VEC: store i32 %[[v17]], i32* %[[v18]], align 4 +; 
VEC: store i32 %[[v17]], i32* %[[v4]], align 4
 ; VEC: br label %[[else2:.+]]
 ;
 ; VEC: [[else2]]:
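
The core of this change is the unified VectorLoopValueMap together with the on-demand conversions in getVectorValue and getScalarValue. Below is a minimal, LLVM-free sketch of that scheme, offered only as an illustration: plain ints stand in for llvm::Value*, a "vector part" is modeled as VF lanes in a std::vector, the main() driver is hypothetical, and the VF == 1 shortcut and loop-invariant broadcasting handled by the real getVectorValue are omitted. The class and member names mirror the patch, but none of this code is taken from it.

// Standalone sketch of the patch's value-map scheme (not the LLVM code).
// Each original value maps to UF vector parts and/or UF x VF scalar lanes;
// the missing form is synthesized on demand. In the real patch the on-demand
// construction emits insertelement/extractelement instructions.
#include <cassert>
#include <map>
#include <utility>
#include <vector>

using Value = int;                                    // stand-in for llvm::Value*
using VectorParts = std::vector<std::vector<Value>>;  // UF parts, each VF lanes wide
using ScalarParts = std::vector<std::vector<Value>>;  // UF x VF scalar lanes

class ValueMapModel {
  unsigned UF, VF;
  std::map<Value, VectorParts> VectorMap;
  std::map<Value, ScalarParts> ScalarMap;

public:
  ValueMapModel(unsigned UnrollFactor, unsigned VecWidth)
      : UF(UnrollFactor), VF(VecWidth) {}

  bool hasVector(Value K) const { return VectorMap.count(K); }
  bool hasScalar(Value K) const { return ScalarMap.count(K); }

  const VectorParts &initVector(Value K, VectorParts E) {
    assert(!hasVector(K) && E.size() == UF && "bad vector entry");
    return VectorMap[K] = std::move(E);
  }
  const ScalarParts &initScalar(Value K, ScalarParts E) {
    assert(!hasScalar(K) && E.size() == UF && "bad scalar entry");
    return ScalarMap[K] = std::move(E);
  }

  // If only scalar lanes exist, pack them into vector parts on demand
  // (the patch does this with an insertelement sequence).
  const VectorParts &getVectorValue(Value K) {
    if (hasVector(K))
      return VectorMap[K];
    assert(hasScalar(K) && "loop-invariant broadcast omitted in this sketch");
    VectorParts Parts(UF);
    for (unsigned Part = 0; Part < UF; ++Part)
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        Parts[Part].push_back(ScalarMap[K][Part][Lane]);
    return initVector(K, Parts);
  }

  // If only vector parts exist, pull out the requested lane on demand
  // (the patch emits an extractelement for this when VF > 1).
  Value getScalarValue(Value K, unsigned Part, unsigned Lane) {
    if (hasScalar(K))
      return ScalarMap[K][Part][Lane];
    return getVectorValue(K)[Part][Lane];
  }
};

int main() {
  ValueMapModel Map(/*UF=*/2, /*VF=*/4);
  // Scalarized induction steps, analogous to widenIntInduction's
  // Entry[Part][Lane] = ScalarIV + (VF * Part + Lane) * Step with Step = 1.
  ScalarParts Steps = {{0, 1, 2, 3}, {4, 5, 6, 7}};
  Map.initScalar(/*Key=*/42, Steps);
  assert(Map.getScalarValue(42, 1, 2) == 6);  // scalar use: no conversion needed
  assert(Map.getVectorValue(42)[0][3] == 3);  // vector use: built on demand from lanes
  return 0;
}

As the patch's own comment notes, std::map rather than DenseMap backs both maps because DenseMap insertions invalidate its iterators; the sketch keeps that choice.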