Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -305,8 +305,9 @@ : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Induction(nullptr), - OldInduction(nullptr), WidenMap(UnrollFactor), TripCount(nullptr), - VectorTripCount(nullptr), Legal(nullptr), AddedSafetyChecks(false) {} + OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth), + TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), + AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). // MinimumBitWidths maps scalar integer values to the smallest bitwidth they @@ -333,11 +334,18 @@ protected: /// A small list of PHINodes. typedef SmallVector PhiVector; - /// When we unroll loops we have multiple vector values for each scalar. - /// This data structure holds the unrolled and vectorized values that - /// originated from one scalar instruction. + + /// A type for vectorized values in the new loop. Each value from the + /// original loop, when vectorized, is represented by UF vector values in the + /// new unrolled loop, where UF is the unroll factor. typedef SmallVector VectorParts; + /// A type for scalarized values in the new loop. Each value from the + /// original loop, when scalarized, is represented by UF x VF scalar values + /// in the new unrolled loop, where UF is the unroll factor and VF is the + /// vectorization factor. + typedef SmallVector, 2> ScalarParts; + // When we if-convert we need to create edge masks. We have to cache values // so that we don't end up with exponential recursion/IR. typedef DenseMap, VectorParts> @@ -441,13 +449,21 @@ /// Returns true if we should generate a scalar version of \p IV. 
bool needsScalarInduction(Instruction *IV) const; - /// When we go over instructions in the basic block we rely on previous - /// values within the current basic block or on loop invariant values. - /// When we widen (vectorize) values we place them in the map. If the values - /// are not within the map, they have to be loop invariant, so we simply - /// broadcast them into a vector. + /// Return the VectorParts corresponding to \p V from the original loop. If + /// the value has already been vectorized, the corresponding vector entry in + /// VectorLoopValueMap is returned. If, however, the value has a scalar entry + /// in VectorLoopValueMap, we construct new vector values on-demand by + /// inserting the scalar values into vectors with an insertelement sequence. + /// If the value has been neither vectorized nor scalarized, it must be loop + /// invariant, so we simply broadcast the value into vectors. VectorParts &getVectorValue(Value *V); + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part and vector index \p Lane. If the value has + /// been vectorized but not scalarized, the necessary extractelement + /// instruction will be generated. + Value *getScalarValue(Value *V, unsigned Part, unsigned Lane); + /// Try to vectorize the interleaved access group that \p Instr belongs to. void vectorizeInterleaveGroup(Instruction *Instr); @@ -489,44 +505,88 @@ /// vector of instructions. void addMetadata(ArrayRef To, Instruction *From); - /// This is a helper class that holds the vectorizer state. It maps scalar - /// instructions to vector instructions. When the code is 'unrolled' then - /// then a single scalar value is mapped to multiple vector parts. The parts - /// are stored in the VectorPart type. + /// This is a helper class for maintaining vectorization state. It's used for + /// mapping values from the original loop to their corresponding values in + /// the new loop. 
Two mappings are maintained: one for vectorized values and + /// one for scalarized values. Vectorized values are represented with UF + /// vector values in the new loop, and scalarized values are represented with + /// UF x VF scalar values in the new loop. UF and VF are the unroll and + /// vectorization factors, respectively. + /// + /// Entries can be added to either map with initVector and initScalar, which + /// initialize and return a reference to a new entry. Currently, entries can + /// only be accessed using the getVectorValue and getScalarValue functions + /// from InnerLoopVectorizer. + /// + /// getVectorValue and getScalarValue coordinate to generate a vector or + /// scalar value on-demand if one is not yet available. When vectorizing a + /// loop, we visit the definition of an instruction before its uses. When + /// visiting the definition, we either vectorize or scalarize the + /// instruction, creating an entry for it in the corresponding map. (In some + /// cases, such as induction variables, we will create both vector and scalar + /// entries.) Then, as we encounter uses of the definition, we derive values + /// for each scalar or vector use unless such a value is already available. + /// For example, if we scalarize a definition and one of its uses is vector, + /// we build the required vector on-demand with an insertelement sequence + /// when visiting the use. Otherwise, if the use is scalar, we can use the + /// existing scalar definition. struct ValueMap { - /// C'tor. UnrollFactor controls the number of vectors ('parts') that - /// are mapped. - ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} - - /// \return True if 'Key' is saved in the Value Map. - bool has(Value *Key) const { return MapStorage.count(Key); } - - /// Initializes a new entry in the map. Sets all of the vector parts to the - /// save value in 'Val'. - /// \return A reference to a vector with splat values. 
- VectorParts &splat(Value *Key, Value *Val) { - VectorParts &Entry = MapStorage[Key]; + + /// Construct an empty map with the given unroll and vectorization factors. + ValueMap(unsigned UnrollFactor, unsigned VecWidth) + : UF(UnrollFactor), VF(VecWidth) {} + + /// \return True if the map has a vector entry for \p Key. + bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); } + + /// \return True if the map has a scalar entry for \p Key. + bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); } + + /// \return A reference to a new vector map entry corresponding to \p Key. + /// The key should not already be in the map. If \p Val is provided, each + /// vector value is initialized to this value. + VectorParts &initVector(Value *Key, Value *Val = nullptr) { + assert(!hasVector(Key) && "VectorParts already initialized"); + auto &Entry = VectorMapStorage[Key]; Entry.assign(UF, Val); return Entry; } - ///\return A reference to the value that is stored at 'Key'. - VectorParts &get(Value *Key) { - VectorParts &Entry = MapStorage[Key]; - if (Entry.empty()) - Entry.resize(UF); - assert(Entry.size() == UF); + /// \return A reference to a new scalar map entry corresponding to \p Key. + /// The key should not already be in the map. If \p Val is provided, each + /// scalar value is initialized to this value. + ScalarParts &initScalar(Value *Key, Value *Val = nullptr) { + assert(!hasScalar(Key) && "ScalarParts already initialized"); + auto &Entry = ScalarMapStorage[Key]; + Entry.resize(UF); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part].assign(VF, Val); return Entry; } + /// Remove the entry corresponding to \p Key from the vector map. + bool eraseVector(Value *Key) { return VectorMapStorage.erase(Key); } + + /// Retrieve an entry from the vector or scalar maps. The only way to + /// access an existing mapped entry is with getVectorValue or + /// getScalarValue from InnerLoopVectorizer. 
Until those functions are + /// moved inside ValueMap, we have to declare them as friends. + friend VectorParts &InnerLoopVectorizer::getVectorValue(Value *V); + friend Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, + unsigned Lane); + private: - /// The unroll factor. Each entry in the map stores this number of vector - /// elements. + /// The unroll factor. Each entry in the vector map contains UF vector + /// values. unsigned UF; - /// Map storage. We use std::map and not DenseMap because insertions to a - /// dense map invalidates its iterators. - std::map MapStorage; + /// The vectorization factor. Each entry in the scalar map contains UF x VF + /// scalar values. + unsigned VF; + + /// The vector and scalar map storage. + DenseMap VectorMapStorage; + DenseMap ScalarMapStorage; }; /// The original loop. @@ -590,18 +650,12 @@ PHINode *Induction; /// The induction variable of the old basic block. PHINode *OldInduction; - /// Maps scalars to widened vectors. - ValueMap WidenMap; - - /// A map of induction variables from the original loop to their - /// corresponding VF * UF scalarized values in the vectorized loop. The - /// purpose of ScalarIVMap is similar to that of WidenMap. Whereas WidenMap - /// maps original loop values to their vector versions in the new loop, - /// ScalarIVMap maps induction variables from the original loop that are not - /// vectorized to their scalar equivalents in the vector loop. Maintaining a - /// separate map for scalarized induction variables allows us to avoid - /// unnecessary scalar-to-vector-to-scalar conversions. - DenseMap> ScalarIVMap; + + /// Maps values from the original loop to their corresponding values in the + /// vectorized loop. A key value can map to either vector values, scalar + /// values or both kinds of values, depending on whether the key was + /// vectorized, scalarized, or both. 
+ ValueMap VectorLoopValueMap; /// Store instructions that should be predicated, as a pair /// @@ -2143,13 +2197,14 @@ assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() && "Val and Step should have the same integer type"); - // Compute the scalar steps and save the results in ScalarIVMap. + // Compute the scalar steps and save the results in VectorLoopValueMap. + auto &Entry = VectorLoopValueMap.initScalar(EntryVal); for (unsigned Part = 0; Part < UF; ++Part) - for (unsigned I = 0; I < VF; ++I) { - auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + I); + for (unsigned Lane = 0; Lane < VF; ++Lane) { + auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane); auto *Mul = Builder.CreateMul(StartIdx, Step); auto *Add = Builder.CreateAdd(ScalarIV, Mul); - ScalarIVMap[EntryVal].push_back(Add); + Entry[Part][Lane] = Add; } } @@ -2252,13 +2307,71 @@ V = ConstantInt::get(V->getType(), 1); // If we have this scalar in the map, return it. - if (WidenMap.has(V)) - return WidenMap.get(V); + if (VectorLoopValueMap.hasVector(V)) + return VectorLoopValueMap.VectorMapStorage[V]; + + // If the value has not been vectorized, check if it has been scalarized + // instead. If it has been scalarized, and we actually need the value in + // vector form, we will construct the vector values on demand. + if (VectorLoopValueMap.hasScalar(V)) { + + // Initialize a new vector map entry. + auto &Entry = VectorLoopValueMap.initVector(V); + + // If V doesn't produce a value, just return the initialized entry. + if (V->getType()->isVoidTy()) + return Entry; + + // If we aren't vectorizing, we can just copy the scalar map values over to + // the vector map. + if (VF == 1) { + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = getScalarValue(V, Part, 0); + return Entry; + } + + // However, if we are vectorizing, we need to construct the vector values + // using insertelement instructions. 
Since the resulting vectors are stored + // in VectorLoopValueMap, we will only generate the insertelements once. + for (unsigned Part = 0; Part < UF; ++Part) { + Value *Insert = UndefValue::get(VectorType::get(V->getType(), VF)); + for (unsigned Width = 0; Width < VF; ++Width) + Insert = Builder.CreateInsertElement( + Insert, getScalarValue(V, Part, Width), Builder.getInt32(Width)); + Entry[Part] = Insert; + } + return Entry; + } // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. - Value *B = getBroadcastInstrs(V); - return WidenMap.splat(V, B); + return VectorLoopValueMap.initVector(V, getBroadcastInstrs(V)); +} + +Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, + unsigned Lane) { + + // If the value is not an instruction contained in the loop, it should + // already be scalar. + if (OrigLoop->isLoopInvariant(V)) + return V; + + // If the value from the original loop has not been vectorized, it is + // represented by UF x VF scalar values in the new loop. Return the requested + // scalar value. + if (VectorLoopValueMap.hasScalar(V)) + return VectorLoopValueMap.ScalarMapStorage[V][Part][Lane]; + + // If the value has not been scalarized, it may have been vectorized. Get the + // value corresponding to the requested unroll index. + auto *U = getVectorValue(V)[Part]; + if (!U->getType()->isVectorTy()) + return U; + + // Otherwise, the value from the original loop has been vectorized and is + // represented by UF vector values. Extract and return the requested scalar + // value from the appropriate vector lane. + return Builder.CreateExtractElement(U, Builder.getInt32(Lane)); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -2416,15 +2529,10 @@ // Prepare for the new pointers. 
setDebugLocFromInst(Builder, Ptr); - VectorParts &PtrParts = getVectorValue(Ptr); SmallVector NewPtrs; unsigned Index = Group->getIndex(Instr); for (unsigned Part = 0; Part < UF; Part++) { - // Extract the pointer for current instruction from the pointer vector. A - // reverse access uses the pointer in the last lane. - Value *NewPtr = Builder.CreateExtractElement( - PtrParts[Part], - Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0)); + Value *NewPtr = getScalarValue(Ptr, Part, Group->isReverse() ? VF - 1 : 0); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2448,6 +2556,13 @@ // Vectorize the interleaved load group. if (LI) { + + // Initialize a vector entry for each member of the group in + // VectorLoopValueMap. + for (unsigned I = 0; I < InterleaveFactor; ++I) + if (auto *Member = Group->getMember(I)) + VectorLoopValueMap.initVector(Member); + for (unsigned Part = 0; Part < UF; Part++) { Instruction *NewLoadInstr = Builder.CreateAlignedLoad( NewPtrs[Part], Group->getAlignment(), "wide.vec"); @@ -2469,7 +2584,7 @@ StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); } - VectorParts &Entry = WidenMap.get(Member); + VectorParts &Entry = getVectorValue(Member); Entry[Part] = Group->isReverse() ? reverseVector(StridedVec) : StridedVec; } @@ -2563,8 +2678,7 @@ if (!ConsecutiveStride && !CreateGatherScatter) return scalarizeInstruction(Instr); - Constant *Zero = Builder.getInt32(0); - VectorParts &Entry = WidenMap.get(Instr); + VectorParts &Entry = getVectorValue(Instr); VectorParts VectorGep; // Handle consecutive loads/stores. 
@@ -2572,9 +2686,7 @@ if (ConsecutiveStride) { if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { setDebugLocFromInst(Builder, Gep); - Value *PtrOperand = Gep->getPointerOperand(); - Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; - FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); + auto *FirstBasePtr = getScalarValue(Gep->getPointerOperand(), 0, 0); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast(Gep->clone()); @@ -2605,16 +2717,7 @@ OrigLoop)) && "Must be last index or loop invariant"); - VectorParts &GEPParts = getVectorValue(GepOperand); - - // If GepOperand is an induction variable, and there's a scalarized - // version of it available, use it. Otherwise, we will need to create - // an extractelement instruction. - Value *Index = ScalarIVMap.count(GepOperand) - ? ScalarIVMap[GepOperand][0] - : Builder.CreateExtractElement(GEPParts[0], Zero); - - Gep2->setOperand(i, Index); + Gep2->setOperand(i, getScalarValue(GepOperand, 0, 0)); Gep2->setName("gep.indvar.idx"); } } @@ -2623,8 +2726,7 @@ // Use the induction element ptr. assert(isa(Ptr) && "Invalid induction ptr"); setDebugLocFromInst(Builder, Ptr); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); + Ptr = getScalarValue(Ptr, 0, 0); } } else { // At this point we should vector version of GEP for Gather or Scatter @@ -2753,42 +2855,13 @@ setDebugLocFromInst(Builder, Instr); - // Find all of the vectorized parameters. - for (Value *SrcOp : Instr->operands()) { - // If we are accessing the old induction variable, use the new one. - if (SrcOp == OldInduction) { - Params.push_back(getVectorValue(SrcOp)); - continue; - } - - // Try using previously calculated values. - auto *SrcInst = dyn_cast(SrcOp); - - // If the src is an instruction that appeared earlier in the basic block, - // then it should already be vectorized. 
- if (SrcInst && OrigLoop->contains(SrcInst)) { - assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); - // The parameter is a vector value from earlier. - Params.push_back(WidenMap.get(SrcInst)); - } else { - // The parameter is a scalar from outside the loop. Maybe even a constant. - VectorParts Scalars; - Scalars.append(UF, SrcOp); - Params.push_back(Scalars); - } - } - - assert(Params.size() == Instr->getNumOperands() && - "Invalid number of operands"); - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = - IsVoidRetTy ? nullptr - : UndefValue::get(VectorType::get(Instr->getType(), VF)); - // Create a new entry in the WidenMap and initialize it to Undef or Null. - VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + // The instruction will not be vectorized. Erase its vector entry from + // VectorLoopValueMap and get a new scalar entry instead. + VectorLoopValueMap.eraseVector(Instr); + auto &Entry = VectorLoopValueMap.initScalar(Instr); VectorParts Cond; if (IfPredicateStore) { @@ -2814,18 +2887,11 @@ Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instructions with extracted scalars. - for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - // If the operand is an induction variable, and there's a scalarized - // version of it available, use it. Otherwise, we will need to create - // an extractelement instruction if vectorizing. - auto *NewOp = Params[op][Part]; - auto *ScalarOp = Instr->getOperand(op); - if (ScalarIVMap.count(ScalarOp)) - NewOp = ScalarIVMap[ScalarOp][VF * Part + Width]; - else if (NewOp->getType()->isVectorTy()) - NewOp = Builder.CreateExtractElement(NewOp, Builder.getInt32(Width)); + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. 
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Width); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -2833,16 +2899,14 @@ // Place the cloned scalar in the new loop. Builder.Insert(Cloned); + // Add the cloned scalar to VectorLoopValueMap. + Entry[Part][Width] = Cloned; + // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast(Cloned)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); - // If the original scalar returns a value we need to place it in a vector - // so that future users will be able to use it. - if (!IsVoidRetTy) - VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, - Builder.getInt32(Width)); // End if-block. if (IfPredicateStore) PredicatedStores.push_back( @@ -3488,7 +3552,7 @@ // SmallPtrSet Erased; for (const auto &KV : *MinBWs) { - VectorParts &Parts = WidenMap.get(KV.first); + VectorParts &Parts = getVectorValue(KV.first); for (Value *&I : Parts) { if (Erased.count(I) || I->use_empty() || !isa(I)) continue; @@ -3580,7 +3644,7 @@ // We'll have created a bunch of ZExts that are now parentless. Clean up. for (const auto &KV : *MinBWs) { - VectorParts &Parts = WidenMap.get(KV.first); + VectorParts &Parts = getVectorValue(KV.first); for (Value *&I : Parts) { ZExtInst *Inst = dyn_cast(I); if (Inst && Inst->use_empty()) { @@ -3697,7 +3761,7 @@ // Reductions do not have to start at zero. They can start with // any loop invariant values. - VectorParts &VecRdxPhi = WidenMap.get(Phi); + VectorParts &VecRdxPhi = getVectorValue(Phi); BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = Phi->getIncomingValueForBlock(Latch); VectorParts &Val = getVectorValue(LoopVal); @@ -4218,7 +4282,16 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. 
for (Instruction &I : *BB) { - VectorParts &Entry = WidenMap.get(&I); + + // Instructions in interleaved access groups are vectorized all at once. + // We need to check for interleaved groups here so we don't attempt to + // initialize entries in VectorLoopValueMap more than once. + if (Legal->isAccessInterleaved(&I)) { + vectorizeInterleaveGroup(&I); + continue; + } + + VectorParts &Entry = VectorLoopValueMap.initVector(&I); switch (I.getOpcode()) { case Instruction::Br: @@ -4285,10 +4358,7 @@ VectorParts &Op0 = getVectorValue(I.getOperand(1)); VectorParts &Op1 = getVectorValue(I.getOperand(2)); - Value *ScalarCond = - (VF == 1) - ? Cond[0] - : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); + auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = Builder.CreateSelect( @@ -6391,40 +6461,13 @@ setDebugLocFromInst(Builder, Instr); - // Find all of the vectorized parameters. - for (Value *SrcOp : Instr->operands()) { - // If we are accessing the old induction variable, use the new one. - if (SrcOp == OldInduction) { - Params.push_back(getVectorValue(SrcOp)); - continue; - } - - // Try using previously calculated values. - Instruction *SrcInst = dyn_cast(SrcOp); - - // If the src is an instruction that appeared earlier in the basic block - // then it should already be vectorized. - if (SrcInst && OrigLoop->contains(SrcInst)) { - assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); - // The parameter is a vector value from earlier. - Params.push_back(WidenMap.get(SrcInst)); - } else { - // The parameter is a scalar from outside the loop. Maybe even a constant. - VectorParts Scalars; - Scalars.append(UF, SrcOp); - Params.push_back(Scalars); - } - } - - assert(Params.size() == Instr->getNumOperands() && - "Invalid number of operands"); - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = IsVoidRetTy ? 
nullptr : UndefValue::get(Instr->getType()); - // Create a new entry in the WidenMap and initialize it to Undef or Null. - VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + // The instruction will not be vectorized. Erase its vector entry from + // VectorLoopValueMap and get a new scalar entry instead. + VectorLoopValueMap.eraseVector(Instr); + auto &Entry = VectorLoopValueMap.initScalar(Instr); VectorParts Cond; if (IfPredicateStore) { @@ -6451,25 +6494,25 @@ Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instructions with extracted scalars. + + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - Value *Op = Params[op][Part]; - Cloned->setOperand(op, Op); + auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0); + Cloned->setOperand(op, NewOp); } // Place the cloned scalar in the new loop. Builder.Insert(Cloned); + // Add the cloned scalar to VectorLoopValueMap. + Entry[Part][0] = Cloned; + // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast(Cloned)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); - // If the original scalar returns a value we need to place it in a vector - // so that future users will be able to use it. - if (!IsVoidRetTy) - VecResults[Part] = Cloned; - // End if-block. 
if (IfPredicateStore) PredicatedStores.push_back(std::make_pair(cast(Cloned), Cmp)); Index: test/Transforms/LoopVectorize/X86/scatter_crash.ll =================================================================== --- test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -39,102 +39,70 @@ ; CHECK-NEXT: [[IND30:%.*]] = add i64 %offset.idx, 30 ; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i64> , [[VEC_IND]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND00]] -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND02]] -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND04]] -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND06]] -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND08]] -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND10]] -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND12]] -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6 ; 
CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND14]] -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7 ; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND16]] -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8 ; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND18]] -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9 ; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND20]] -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10 ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND22]] -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND24]] -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12 ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND26]] -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13 ; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND28]] -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14 ; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND30]] -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15 ; CHECK-NEXT: 
[[TMP59:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]] -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = extractelement <16 x i64> [[TMP59]], i32 0 -; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP60]], i64 [[TMP61]], i64 0 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <16 x i32*> undef, i32* [[TMP62]], i32 0 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 1 +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP12]], i64 [[TMP61]], i64 0 ; CHECK-NEXT: [[TMP65:%.*]] = extractelement <16 x i64> [[TMP59]], i32 1 -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP64]], i64 [[TMP65]], i64 0 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <16 x i32*> [[TMP63]], i32* [[TMP66]], i32 1 -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 2 +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP15]], i64 [[TMP65]], i64 0 ; CHECK-NEXT: [[TMP69:%.*]] = extractelement <16 x i64> [[TMP59]], i32 2 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP68]], i64 [[TMP69]], i64 0 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <16 x i32*> [[TMP67]], i32* [[TMP70]], i32 2 -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 3 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP18]], i64 [[TMP69]], i64 0 ; CHECK-NEXT: [[TMP73:%.*]] = extractelement <16 x i64> [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP72]], i64 [[TMP73]], i64 0 -; CHECK-NEXT: [[TMP75:%.*]] = insertelement <16 x i32*> [[TMP71]], i32* [[TMP74]], i32 3 -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 4 +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP21]], i64 [[TMP73]], i64 0 ; 
CHECK-NEXT: [[TMP77:%.*]] = extractelement <16 x i64> [[TMP59]], i32 4 -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP76]], i64 [[TMP77]], i64 0 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <16 x i32*> [[TMP75]], i32* [[TMP78]], i32 4 -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 5 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP24]], i64 [[TMP77]], i64 0 ; CHECK-NEXT: [[TMP81:%.*]] = extractelement <16 x i64> [[TMP59]], i32 5 -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP80]], i64 [[TMP81]], i64 0 -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <16 x i32*> [[TMP79]], i32* [[TMP82]], i32 5 -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 6 +; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP27]], i64 [[TMP81]], i64 0 ; CHECK-NEXT: [[TMP85:%.*]] = extractelement <16 x i64> [[TMP59]], i32 6 -; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP84]], i64 [[TMP85]], i64 0 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <16 x i32*> [[TMP83]], i32* [[TMP86]], i32 6 -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 7 +; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP30]], i64 [[TMP85]], i64 0 ; CHECK-NEXT: [[TMP89:%.*]] = extractelement <16 x i64> [[TMP59]], i32 7 -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP88]], i64 [[TMP89]], i64 0 -; CHECK-NEXT: [[TMP91:%.*]] = insertelement <16 x i32*> [[TMP87]], i32* [[TMP90]], i32 7 -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 8 +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP33]], i64 [[TMP89]], i64 0 ; CHECK-NEXT: [[TMP93:%.*]] = extractelement <16 x i64> [[TMP59]], i32 8 -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds [10 x i32], 
[10 x i32]* [[TMP92]], i64 [[TMP93]], i64 0 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <16 x i32*> [[TMP91]], i32* [[TMP94]], i32 8 -; CHECK-NEXT: [[TMP96:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 9 +; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP36]], i64 [[TMP93]], i64 0 ; CHECK-NEXT: [[TMP97:%.*]] = extractelement <16 x i64> [[TMP59]], i32 9 -; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP96]], i64 [[TMP97]], i64 0 -; CHECK-NEXT: [[TMP99:%.*]] = insertelement <16 x i32*> [[TMP95]], i32* [[TMP98]], i32 9 -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 10 +; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP39]], i64 [[TMP97]], i64 0 ; CHECK-NEXT: [[TMP101:%.*]] = extractelement <16 x i64> [[TMP59]], i32 10 -; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP100]], i64 [[TMP101]], i64 0 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <16 x i32*> [[TMP99]], i32* [[TMP102]], i32 10 -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 11 +; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP42]], i64 [[TMP101]], i64 0 ; CHECK-NEXT: [[TMP105:%.*]] = extractelement <16 x i64> [[TMP59]], i32 11 -; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP104]], i64 [[TMP105]], i64 0 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <16 x i32*> [[TMP103]], i32* [[TMP106]], i32 11 -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 12 +; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP45]], i64 [[TMP105]], i64 0 ; CHECK-NEXT: [[TMP109:%.*]] = extractelement <16 x i64> [[TMP59]], i32 12 -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP108]], i64 [[TMP109]], i64 0 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <16 x i32*> 
[[TMP107]], i32* [[TMP110]], i32 12 -; CHECK-NEXT: [[TMP112:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 13 +; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP48]], i64 [[TMP109]], i64 0 ; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i64> [[TMP59]], i32 13 -; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP112]], i64 [[TMP113]], i64 0 -; CHECK-NEXT: [[TMP115:%.*]] = insertelement <16 x i32*> [[TMP111]], i32* [[TMP114]], i32 13 -; CHECK-NEXT: [[TMP116:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 14 +; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP51]], i64 [[TMP113]], i64 0 ; CHECK-NEXT: [[TMP117:%.*]] = extractelement <16 x i64> [[TMP59]], i32 14 -; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP116]], i64 [[TMP117]], i64 0 -; CHECK-NEXT: [[TMP119:%.*]] = insertelement <16 x i32*> [[TMP115]], i32* [[TMP118]], i32 14 -; CHECK-NEXT: [[TMP120:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 15 +; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP54]], i64 [[TMP117]], i64 0 ; CHECK-NEXT: [[TMP121:%.*]] = extractelement <16 x i64> [[TMP59]], i32 15 -; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP120]], i64 [[TMP121]], i64 0 -; CHECK-NEXT: [[TMP123:%.*]] = insertelement <16 x i32*> [[TMP119]], i32* [[TMP122]], i32 15 +; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP57]], i64 [[TMP121]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3 +; 
CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15 ; CHECK-NEXT: [[VECTORGEP:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP58]], <16 x i64> [[TMP59]], i64 0 ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> [[VECTORGEP]], i32 16, <16 x i1> ) ; CHECK: [[STEP_ADD:%.*]] = add <16 x i64> [[VEC_IND]], Index: test/Transforms/LoopVectorize/if-pred-stores.ll =================================================================== --- test/Transforms/LoopVectorize/if-pred-stores.ll +++ test/Transforms/LoopVectorize/if-pred-stores.ll @@ -12,28 +12,30 @@ br label %for.body ; VEC-LABEL: test +; VEC: %[[v0:.+]] = add i64 %index, 0 +; VEC: %[[v1:.+]] = add i64 %index, 1 +; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]] +; VEC: %[[v4:.+]] = 
getelementptr inbounds i32, i32* %f, i64 %[[v1]] ; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, ; VEC: %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, ; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], ; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0 ; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true ; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 -; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 ; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]] ; ; VEC: [[cond]]: -; VEC: store i32 %[[v13]], i32* %[[v14]], align 4 +; VEC: store i32 %[[v13]], i32* %[[v2]], align 4 ; VEC: br label %[[else:.+]] ; ; VEC: [[else]]: ; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1 ; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true ; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 -; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 ; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]] ; ; VEC: [[cond2]]: -; VEC: store i32 %[[v17]], i32* %[[v18]], align 4 +; VEC: store i32 %[[v17]], i32* %[[v4]], align 4 ; VEC: br label %[[else2:.+]] ; ; VEC: [[else2]]: