Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -305,8 +305,9 @@
         : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
           AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
           Builder(PSE.getSE()->getContext()), Induction(nullptr),
-          OldInduction(nullptr), WidenMap(UnrollFactor), TripCount(nullptr),
-          VectorTripCount(nullptr), Legal(nullptr), AddedSafetyChecks(false) {}
+          OldInduction(nullptr), WidenMap(UnrollFactor, VecWidth),
+          TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr),
+          AddedSafetyChecks(false) {}

   // Perform the actual loop widening (vectorization).
   // MinimumBitWidths maps scalar integer values to the smallest bitwidth they
@@ -448,6 +449,12 @@
   /// broadcast them into a vector.
   VectorParts &getVectorValue(Value *V);

+  /// Return a value in the new loop corresponding to \p V from the original
+  /// loop at unroll index \p Part and vector index \p Lane. If the value has
+  /// been vectorized but not scalarized, the necessary extractelement
+  /// instruction will be generated.
+  Value *getScalarValue(Value *V, unsigned Part, unsigned Lane);
+
   /// Try to vectorize the interleaved access group that \p Instr belongs to.
   void vectorizeInterleaveGroup(Instruction *Instr);

@@ -496,37 +503,62 @@
   struct ValueMap {
     /// C'tor. UnrollFactor controls the number of vectors ('parts') that
     /// are mapped.
-    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
+    ValueMap(unsigned UnrollFactor, unsigned VecWidth)
+        : UF(UnrollFactor), VF(VecWidth) {}
+
+    /// \return True if the map has a vector entry for \p Key.
+    bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); }

-    /// \return True if 'Key' is saved in the Value Map.
-    bool has(Value *Key) const { return MapStorage.count(Key); }
+    /// \return True if the map has a scalar entry for \p Key.
+    bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); }

     /// Initializes a new entry in the map. Sets all of the vector parts to the
     /// same value in 'Val'.
     /// \return A reference to a vector with splat values.
     VectorParts &splat(Value *Key, Value *Val) {
-      VectorParts &Entry = MapStorage[Key];
+      VectorParts &Entry = VectorMapStorage[Key];
       Entry.assign(UF, Val);
       return Entry;
     }

-    ///\return A reference to the value that is stored at 'Key'.
-    VectorParts &get(Value *Key) {
-      VectorParts &Entry = MapStorage[Key];
-      if (Entry.empty())
-        Entry.resize(UF);
-      assert(Entry.size() == UF);
-      return Entry;
+    /// \return A reference to the vector map entry corresponding to \p Key.
+    VectorParts &getVector(Value *Key) {
+      return get(Key, VectorMapStorage, UF);
     }

+    /// \return A reference to the scalar map entry corresponding to \p Key.
+    VectorParts &getScalar(Value *Key) {
+      return get(Key, ScalarMapStorage, UF * VF);
+    }
+
+    /// Remove the entry corresponding to \p Key from the vector map.
+    bool eraseVector(Value *Key) { return VectorMapStorage.erase(Key); }
+
   private:
-    /// The unroll factor. Each entry in the map stores this number of vector
-    /// elements.
+    /// The unroll factor. Each entry in the vector map contains UF vector
+    /// values.
     unsigned UF;

-    /// Map storage. We use std::map and not DenseMap because insertions to a
-    /// dense map invalidates its iterators.
-    std::map<Value *, VectorParts> MapStorage;
+    /// The vectorization factor. Each entry in the scalar map contains UF * VF
+    /// scalar values.
+    unsigned VF;
+
+    /// Vector and scalar map storage.
+    /// We use std::map and not DenseMap because
+    /// insertions to a dense map invalidate its iterators.
+    std::map<Value *, VectorParts> VectorMapStorage;
+    std::map<Value *, VectorParts> ScalarMapStorage;
+
+    /// \return A reference to the entry in the given \p Storage map
+    /// corresponding to \p Key. The entry is guaranteed to contain \p Size
+    /// elements.
+    VectorParts &get(Value *Key, std::map<Value *, VectorParts> &Storage,
+                     unsigned Size) {
+      VectorParts &Entry = Storage[Key];
+      if (Entry.empty())
+        Entry.resize(Size);
+      assert(Entry.size() == Size && "Entry has incorrect size");
+      return Entry;
+    }
   };

   /// The original loop.
@@ -590,18 +622,12 @@
   PHINode *Induction;
   /// The induction variable of the old basic block.
   PHINode *OldInduction;
-  /// Maps scalars to widened vectors.
-  ValueMap WidenMap;
-
-  /// A map of induction variables from the original loop to their
-  /// corresponding VF * UF scalarized values in the vectorized loop. The
-  /// purpose of ScalarIVMap is similar to that of WidenMap. Whereas WidenMap
-  /// maps original loop values to their vector versions in the new loop,
-  /// ScalarIVMap maps induction variables from the original loop that are not
-  /// vectorized to their scalar equivalents in the vector loop. Maintaining a
-  /// separate map for scalarized induction variables allows us to avoid
-  /// unnecessary scalar-to-vector-to-scalar conversions.
-  DenseMap<Value *, SmallVector<Value *, 8>> ScalarIVMap;
+  /// Maps values from the original loop to their corresponding values in the
+  /// vectorized loop. A key value can map to either vector values, scalar
+  /// values, or both kinds of values, depending on whether the key was
+  /// vectorized, scalarized, or both.
+  ValueMap WidenMap;

   /// Store instructions that should be predicated, as a pair
   /// <StoreInst, Predicate>
@@ -2143,13 +2169,14 @@
   assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
          "Val and Step should have the same integer type");

-  // Compute the scalar steps and save the results in ScalarIVMap.
+  // Compute the scalar steps and save the results in WidenMap.
+  auto &Entry = WidenMap.getScalar(EntryVal);
   for (unsigned Part = 0; Part < UF; ++Part)
     for (unsigned I = 0; I < VF; ++I) {
       auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + I);
       auto *Mul = Builder.CreateMul(StartIdx, Step);
       auto *Add = Builder.CreateAdd(ScalarIV, Mul);
-      ScalarIVMap[EntryVal].push_back(Add);
+      Entry[VF * Part + I] = Add;
     }
 }

@@ -2252,8 +2279,42 @@
   V = ConstantInt::get(V->getType(), 1);

   // If we have this scalar in the map, return it.
-  if (WidenMap.has(V))
-    return WidenMap.get(V);
+  if (WidenMap.hasVector(V))
+    return WidenMap.getVector(V);
+
+  // If the value has not been vectorized, check if it has been scalarized
+  // instead. If it has been scalarized, and we actually need the value in
+  // vector form, we will construct the vector values on demand.
+  if (WidenMap.hasScalar(V)) {
+
+    // If V doesn't produce a value, just create an empty vector entry for it
+    // in WidenMap.
+    if (V->getType()->isVoidTy())
+      return WidenMap.splat(V, nullptr);
+
+    // Get the vector map entry.
+    auto &Parts = WidenMap.getVector(V);
+
+    // If we aren't vectorizing, we can just copy the scalar map values over to
+    // the vector map.
+    if (VF == 1) {
+      for (unsigned Part = 0; Part < UF; ++Part)
+        Parts[Part] = getScalarValue(V, Part, 0);
+      return Parts;
+    }
+
+    // However, if we are vectorizing, we need to construct the vector values
+    // using insertelement instructions. Since the resulting vectors are stored
+    // in WidenMap, we will only generate the insertelements once.
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Parts[Part] = UndefValue::get(VectorType::get(V->getType(), VF));
+      for (unsigned Width = 0; Width < VF; ++Width)
+        Parts[Part] = Builder.CreateInsertElement(
+            Parts[Part], getScalarValue(V, Part, Width),
+            Builder.getInt32(Width));
+    }
+    return Parts;
+  }

   // If this scalar is unknown, assume that it is a constant or that it is
   // loop invariant. Broadcast V and save the value for future uses.
@@ -2261,6 +2322,32 @@
   return WidenMap.splat(V, B);
 }

+Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
+                                           unsigned Lane) {
+
+  // If the value is not an instruction contained in the loop, it should
+  // already be scalar.
+  if (OrigLoop->isLoopInvariant(V))
+    return V;
+
+  // If the value from the original loop has not been vectorized, it is
+  // represented by UF * VF scalar values in the new loop. Return the requested
+  // scalar value.
+  if (WidenMap.hasScalar(V))
+    return WidenMap.getScalar(V)[VF * Part + Lane];
+
+  // If the value has not been scalarized, it may have been vectorized. Get the
+  // value corresponding to the requested unroll index.
+  auto *U = getVectorValue(V)[Part];
+  if (!U->getType()->isVectorTy())
+    return U;
+
+  // Otherwise, the value from the original loop has been vectorized and is
+  // represented by UF vector values. Extract and return the requested scalar
+  // value from the appropriate vector lane.
+  return Builder.CreateExtractElement(U, Builder.getInt32(Lane));
+}
+
 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
   assert(Vec->getType()->isVectorTy() && "Invalid type");
   SmallVector<Constant *, 8> ShuffleMask;
@@ -2416,15 +2503,10 @@
   // Prepare for the new pointers.
   setDebugLocFromInst(Builder, Ptr);
-  VectorParts &PtrParts = getVectorValue(Ptr);
   SmallVector<Value *, 2> NewPtrs;
   unsigned Index = Group->getIndex(Instr);

   for (unsigned Part = 0; Part < UF; Part++) {
-    // Extract the pointer for current instruction from the pointer vector. A
-    // reverse access uses the pointer in the last lane.
-    Value *NewPtr = Builder.CreateExtractElement(
-        PtrParts[Part],
-        Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
+    Value *NewPtr = getScalarValue(Ptr, Part, Group->isReverse() ? VF - 1 : 0);

     // Notice current instruction could be any index. Need to adjust the address
     // to the member of index 0.
@@ -2469,7 +2551,7 @@
         StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
       }

-      VectorParts &Entry = WidenMap.get(Member);
+      VectorParts &Entry = WidenMap.getVector(Member);
       Entry[Part] = Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
     }
@@ -2563,8 +2645,7 @@
   if (!ConsecutiveStride && !CreateGatherScatter)
     return scalarizeInstruction(Instr);

-  Constant *Zero = Builder.getInt32(0);
-  VectorParts &Entry = WidenMap.get(Instr);
+  VectorParts &Entry = WidenMap.getVector(Instr);
   VectorParts VectorGep;

   // Handle consecutive loads/stores.
@@ -2572,9 +2653,7 @@
   if (ConsecutiveStride) {
     if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
       setDebugLocFromInst(Builder, Gep);
-      Value *PtrOperand = Gep->getPointerOperand();
-      Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
-      FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
+      auto *FirstBasePtr = getScalarValue(Gep->getPointerOperand(), 0, 0);

       // Create the new GEP with the new induction variable.
       GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
@@ -2605,16 +2684,7 @@
                         OrigLoop)) &&
                "Must be last index or loop invariant");

-          VectorParts &GEPParts = getVectorValue(GepOperand);
-
-          // If GepOperand is an induction variable, and there's a scalarized
-          // version of it available, use it. Otherwise, we will need to create
-          // an extractelement instruction.
-          Value *Index = ScalarIVMap.count(GepOperand)
-                             ? ScalarIVMap[GepOperand][0]
-                             : Builder.CreateExtractElement(GEPParts[0], Zero);
-
-          Gep2->setOperand(i, Index);
+          Gep2->setOperand(i, getScalarValue(GepOperand, 0, 0));
           Gep2->setName("gep.indvar.idx");
         }
       }
@@ -2623,8 +2693,7 @@
       // Use the induction element ptr.
       assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
       setDebugLocFromInst(Builder, Ptr);
-      VectorParts &PtrVal = getVectorValue(Ptr);
-      Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
+      Ptr = getScalarValue(Ptr, 0, 0);
     }
   } else {
     // At this point we should have the vector version of the GEP for Gather
     // or Scatter
@@ -2753,42 +2822,13 @@
   setDebugLocFromInst(Builder, Instr);

-  // Find all of the vectorized parameters.
-  for (Value *SrcOp : Instr->operands()) {
-    // If we are accessing the old induction variable, use the new one.
-    if (SrcOp == OldInduction) {
-      Params.push_back(getVectorValue(SrcOp));
-      continue;
-    }
-
-    // Try using previously calculated values.
-    auto *SrcInst = dyn_cast<Instruction>(SrcOp);
-
-    // If the src is an instruction that appeared earlier in the basic block,
-    // then it should already be vectorized.
-    if (SrcInst && OrigLoop->contains(SrcInst)) {
-      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
-      // The parameter is a vector value from earlier.
-      Params.push_back(WidenMap.get(SrcInst));
-    } else {
-      // The parameter is a scalar from outside the loop. Maybe even a constant.
-      VectorParts Scalars;
-      Scalars.append(UF, SrcOp);
-      Params.push_back(Scalars);
-    }
-  }
-
-  assert(Params.size() == Instr->getNumOperands() &&
-         "Invalid number of operands");
-
   // Does this instruction return a value?
   bool IsVoidRetTy = Instr->getType()->isVoidTy();

-  Value *UndefVec =
-      IsVoidRetTy ? nullptr
-                  : UndefValue::get(VectorType::get(Instr->getType(), VF));
-  // Create a new entry in the WidenMap and initialize it to Undef or Null.
-  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+  // The instruction will not be vectorized. Erase its vector entry from
+  // WidenMap and get a new scalar entry instead.
+  WidenMap.eraseVector(Instr);
+  auto &Entry = WidenMap.getScalar(Instr);

   VectorParts Cond;
   if (IfPredicateStore) {
@@ -2814,18 +2854,11 @@
     Instruction *Cloned = Instr->clone();
     if (!IsVoidRetTy)
       Cloned->setName(Instr->getName() + ".cloned");
-    // Replace the operands of the cloned instructions with extracted scalars.
-    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
-      // If the operand is an induction variable, and there's a scalarized
-      // version of it available, use it. Otherwise, we will need to create
-      // an extractelement instruction if vectorizing.
-      auto *NewOp = Params[op][Part];
-      auto *ScalarOp = Instr->getOperand(op);
-      if (ScalarIVMap.count(ScalarOp))
-        NewOp = ScalarIVMap[ScalarOp][VF * Part + Width];
-      else if (NewOp->getType()->isVectorTy())
-        NewOp = Builder.CreateExtractElement(NewOp, Builder.getInt32(Width));
+    // Replace the operands of the cloned instructions with their scalar
+    // equivalents in the new loop.
+    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+      auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Width);
       Cloned->setOperand(op, NewOp);
     }
     addNewMetadata(Cloned, Instr);
@@ -2833,16 +2866,14 @@
     // Place the cloned scalar in the new loop.
     Builder.Insert(Cloned);

+    // Add the cloned scalar to WidenMap.
+    Entry[VF * Part + Width] = Cloned;
+
     // If we just cloned a new assumption, add it to the assumption cache.
     if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
       if (II->getIntrinsicID() == Intrinsic::assume)
         AC->registerAssumption(II);

-    // If the original scalar returns a value we need to place it in a vector
-    // so that future users will be able to use it.
-    if (!IsVoidRetTy)
-      VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
-                                                     Builder.getInt32(Width));
     // End if-block.
     if (IfPredicateStore)
       PredicatedStores.push_back(
@@ -3488,7 +3519,7 @@
   //
   SmallPtrSet<Value *, 4> Erased;
   for (const auto &KV : *MinBWs) {
-    VectorParts &Parts = WidenMap.get(KV.first);
+    VectorParts &Parts = WidenMap.getVector(KV.first);
     for (Value *&I : Parts) {
       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
         continue;
@@ -3580,7 +3611,7 @@

   // We'll have created a bunch of ZExts that are now parentless. Clean up.
   for (const auto &KV : *MinBWs) {
-    VectorParts &Parts = WidenMap.get(KV.first);
+    VectorParts &Parts = WidenMap.getVector(KV.first);
     for (Value *&I : Parts) {
       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
       if (Inst && Inst->use_empty()) {
@@ -3697,7 +3728,7 @@

       // Reductions do not have to start at zero. They can start with
       // any loop invariant values.
-      VectorParts &VecRdxPhi = WidenMap.get(Phi);
+      VectorParts &VecRdxPhi = WidenMap.getVector(Phi);
       BasicBlock *Latch = OrigLoop->getLoopLatch();
       Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
       VectorParts &Val = getVectorValue(LoopVal);
@@ -4218,7 +4249,7 @@
 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
   // For each instruction in the old loop.
   for (Instruction &I : *BB) {
-    VectorParts &Entry = WidenMap.get(&I);
+    VectorParts &Entry = WidenMap.getVector(&I);

     switch (I.getOpcode()) {
     case Instruction::Br:
@@ -4285,10 +4316,7 @@
       VectorParts &Op0 = getVectorValue(I.getOperand(1));
       VectorParts &Op1 = getVectorValue(I.getOperand(2));

-      Value *ScalarCond =
-          (VF == 1)
-              ? Cond[0]
-              : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
+      auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);

       for (unsigned Part = 0; Part < UF; ++Part) {
         Entry[Part] = Builder.CreateSelect(
@@ -6389,40 +6417,13 @@
   setDebugLocFromInst(Builder, Instr);

-  // Find all of the vectorized parameters.
-  for (Value *SrcOp : Instr->operands()) {
-    // If we are accessing the old induction variable, use the new one.
-    if (SrcOp == OldInduction) {
-      Params.push_back(getVectorValue(SrcOp));
-      continue;
-    }
-
-    // Try using previously calculated values.
-    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
-
-    // If the src is an instruction that appeared earlier in the basic block
-    // then it should already be vectorized.
-    if (SrcInst && OrigLoop->contains(SrcInst)) {
-      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
-      // The parameter is a vector value from earlier.
-      Params.push_back(WidenMap.get(SrcInst));
-    } else {
-      // The parameter is a scalar from outside the loop. Maybe even a constant.
-      VectorParts Scalars;
-      Scalars.append(UF, SrcOp);
-      Params.push_back(Scalars);
-    }
-  }
-
-  assert(Params.size() == Instr->getNumOperands() &&
-         "Invalid number of operands");
-
   // Does this instruction return a value?
   bool IsVoidRetTy = Instr->getType()->isVoidTy();

-  Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(Instr->getType());
-  // Create a new entry in the WidenMap and initialize it to Undef or Null.
-  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+  // The instruction will not be vectorized. Erase its vector entry from
+  // WidenMap and get a new scalar entry instead.
+  WidenMap.eraseVector(Instr);
+  auto &Entry = WidenMap.getScalar(Instr);

   VectorParts Cond;
   if (IfPredicateStore) {
@@ -6449,25 +6450,25 @@
     Instruction *Cloned = Instr->clone();
     if (!IsVoidRetTy)
       Cloned->setName(Instr->getName() + ".cloned");
-    // Replace the operands of the cloned instructions with extracted scalars.
+
+    // Replace the operands of the cloned instructions with their scalar
+    // equivalents in the new loop.
     for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
-      Value *Op = Params[op][Part];
-      Cloned->setOperand(op, Op);
+      auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0);
+      Cloned->setOperand(op, NewOp);
     }

     // Place the cloned scalar in the new loop.
     Builder.Insert(Cloned);

+    // Add the cloned scalar to WidenMap.
+    Entry[Part] = Cloned;
+
     // If we just cloned a new assumption, add it to the assumption cache.
     if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
       if (II->getIntrinsicID() == Intrinsic::assume)
         AC->registerAssumption(II);

-    // If the original scalar returns a value we need to place it in a vector
-    // so that future users will be able to use it.
-    if (!IsVoidRetTy)
-      VecResults[Part] = Cloned;
-
     // End if-block.
     if (IfPredicateStore)
       PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), Cmp));
Index: test/Transforms/LoopVectorize/X86/scatter_crash.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -39,102 +39,70 @@
 ; CHECK-NEXT: [[IND30:%.*]] = add i64 %offset.idx, 30
 ; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i64> , [[VEC_IND]]
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND00]]
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0
 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND02]]
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1
 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND04]]
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2
 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND06]]
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3
 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND08]]
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4
 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND10]]
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5
 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND12]]
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6
;
CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND14]] -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7 ; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND16]] -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8 ; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND18]] -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9 ; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND20]] -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10 ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND22]] -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND24]] -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12 ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND26]] -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13 ; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND28]] -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14 ; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND30]] -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15 ; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]] -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = extractelement <16 x i64> [[TMP59]], i32 0 -; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP60]], i64 [[TMP61]], i64 0 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <16 x i32*> undef, i32* [[TMP62]], i32 0 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 1 +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP12]], i64 [[TMP61]], i64 0 ; CHECK-NEXT: [[TMP65:%.*]] = extractelement <16 x i64> [[TMP59]], i32 1 -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP64]], i64 [[TMP65]], i64 0 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <16 x i32*> [[TMP63]], i32* [[TMP66]], i32 1 -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 2 +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP15]], i64 [[TMP65]], i64 0 ; CHECK-NEXT: [[TMP69:%.*]] = extractelement <16 x i64> [[TMP59]], i32 2 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP68]], i64 [[TMP69]], i64 0 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <16 x i32*> [[TMP67]], i32* [[TMP70]], i32 2 -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 3 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP18]], i64 [[TMP69]], i64 0 ; CHECK-NEXT: 
[[TMP73:%.*]] = extractelement <16 x i64> [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP72]], i64 [[TMP73]], i64 0 -; CHECK-NEXT: [[TMP75:%.*]] = insertelement <16 x i32*> [[TMP71]], i32* [[TMP74]], i32 3 -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 4 +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP21]], i64 [[TMP73]], i64 0 ; CHECK-NEXT: [[TMP77:%.*]] = extractelement <16 x i64> [[TMP59]], i32 4 -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP76]], i64 [[TMP77]], i64 0 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <16 x i32*> [[TMP75]], i32* [[TMP78]], i32 4 -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 5 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP24]], i64 [[TMP77]], i64 0 ; CHECK-NEXT: [[TMP81:%.*]] = extractelement <16 x i64> [[TMP59]], i32 5 -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP80]], i64 [[TMP81]], i64 0 -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <16 x i32*> [[TMP79]], i32* [[TMP82]], i32 5 -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 6 +; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP27]], i64 [[TMP81]], i64 0 ; CHECK-NEXT: [[TMP85:%.*]] = extractelement <16 x i64> [[TMP59]], i32 6 -; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP84]], i64 [[TMP85]], i64 0 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <16 x i32*> [[TMP83]], i32* [[TMP86]], i32 6 -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 7 +; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP30]], i64 [[TMP85]], i64 0 ; CHECK-NEXT: [[TMP89:%.*]] = extractelement <16 x i64> [[TMP59]], i32 7 -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP88]], i64 [[TMP89]], i64 0 -; CHECK-NEXT: [[TMP91:%.*]] = insertelement <16 x i32*> [[TMP87]], i32* [[TMP90]], i32 7 -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 8 +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP33]], i64 [[TMP89]], i64 0 ; CHECK-NEXT: [[TMP93:%.*]] = extractelement <16 x i64> [[TMP59]], i32 8 -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP92]], i64 [[TMP93]], i64 0 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <16 x i32*> [[TMP91]], i32* [[TMP94]], i32 8 -; CHECK-NEXT: [[TMP96:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 9 +; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP36]], i64 [[TMP93]], i64 0 ; CHECK-NEXT: [[TMP97:%.*]] = extractelement <16 x i64> [[TMP59]], i32 9 -; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP96]], i64 [[TMP97]], i64 0 -; CHECK-NEXT: [[TMP99:%.*]] = insertelement <16 x i32*> [[TMP95]], i32* [[TMP98]], i32 9 -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 10 +; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP39]], i64 [[TMP97]], i64 0 ; CHECK-NEXT: [[TMP101:%.*]] = extractelement <16 x i64> [[TMP59]], i32 10 -; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP100]], i64 [[TMP101]], i64 0 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <16 x i32*> [[TMP99]], i32* [[TMP102]], i32 10 -; CHECK-NEXT: [[TMP104:%.*]] 
= extractelement <16 x [10 x i32]*> [[TMP58]], i32 11 +; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP42]], i64 [[TMP101]], i64 0 ; CHECK-NEXT: [[TMP105:%.*]] = extractelement <16 x i64> [[TMP59]], i32 11 -; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP104]], i64 [[TMP105]], i64 0 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <16 x i32*> [[TMP103]], i32* [[TMP106]], i32 11 -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 12 +; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP45]], i64 [[TMP105]], i64 0 ; CHECK-NEXT: [[TMP109:%.*]] = extractelement <16 x i64> [[TMP59]], i32 12 -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP108]], i64 [[TMP109]], i64 0 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <16 x i32*> [[TMP107]], i32* [[TMP110]], i32 12 -; CHECK-NEXT: [[TMP112:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 13 +; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP48]], i64 [[TMP109]], i64 0 ; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i64> [[TMP59]], i32 13 -; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP112]], i64 [[TMP113]], i64 0 -; CHECK-NEXT: [[TMP115:%.*]] = insertelement <16 x i32*> [[TMP111]], i32* [[TMP114]], i32 13 -; CHECK-NEXT: [[TMP116:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 14 +; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP51]], i64 [[TMP113]], i64 0 ; CHECK-NEXT: [[TMP117:%.*]] = extractelement <16 x i64> [[TMP59]], i32 14 -; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP116]], i64 [[TMP117]], i64 0 -; CHECK-NEXT: [[TMP119:%.*]] = insertelement <16 x i32*> [[TMP115]], i32* [[TMP118]], i32 14 -; CHECK-NEXT: [[TMP120:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 15 +; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP54]], i64 [[TMP117]], i64 0 ; CHECK-NEXT: [[TMP121:%.*]] = extractelement <16 x i64> [[TMP59]], i32 15 -; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP120]], i64 [[TMP121]], i64 0 -; CHECK-NEXT: [[TMP123:%.*]] = insertelement <16 x i32*> [[TMP119]], i32* [[TMP122]], i32 15 +; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP57]], i64 [[TMP121]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement 
<16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15 ; CHECK-NEXT: [[VECTORGEP:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP58]], <16 x i64> [[TMP59]], i64 0 ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> [[VECTORGEP]], i32 16, <16 x i1> ) ; CHECK: [[STEP_ADD:%.*]] = add <16 x i64> [[VEC_IND]], Index: test/Transforms/LoopVectorize/if-pred-stores.ll =================================================================== --- test/Transforms/LoopVectorize/if-pred-stores.ll +++ test/Transforms/LoopVectorize/if-pred-stores.ll @@ -12,28 +12,30 @@ br label %for.body ; VEC-LABEL: test +; VEC: %[[v0:.+]] = add i64 %index, 0 +; VEC: %[[v1:.+]] = add i64 %index, 1 +; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]] +; VEC: %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]] ; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, ; VEC: %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, ; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], ; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0 ; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true ; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 -; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 ; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]] ; ; VEC: [[cond]]: -; VEC: store i32 %[[v13]], i32* %[[v14]], align 4 +; VEC: store i32 %[[v13]], i32* %[[v2]], align 4 ; VEC: br label %[[else:.+]] ; ; VEC: [[else]]: ; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1 ; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true ; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 -; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 ; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]] ; ; VEC: [[cond2]]: -; VEC: store i32 %[[v17]], i32* %[[v18]], align 4 +; VEC: store i32 %[[v17]], i32* %[[v4]], align 4 ; VEC: br label %[[else2:.+]] ; ; VEC: [[else2]]:
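
For readers who want the new map layout in isolation, the following is a minimal, standalone C++ sketch of the unified WidenMap this patch introduces. It is not the patch itself: `Value` and `VectorParts` here are simplified stand-ins for the LLVM types (the real code uses `llvm::Value` and a `SmallVector`-based `VectorParts` typedef), and only the two-map storage and the UF versus UF * VF entry sizing mirror the code above.

// Sketch only: minimal stand-ins so the example compiles without LLVM.
#include <cassert>
#include <map>
#include <vector>

struct Value {};                          // stand-in for llvm::Value
using VectorParts = std::vector<Value *>; // stand-in for the LLVM typedef

class ValueMap {
public:
  ValueMap(unsigned UnrollFactor, unsigned VecWidth)
      : UF(UnrollFactor), VF(VecWidth) {}

  bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); }
  bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); }

  // A vectorized value owns one widened value per unroll part.
  VectorParts &getVector(Value *Key) { return get(Key, VectorMapStorage, UF); }

  // A scalarized value owns one scalar per part and lane: UF * VF in total.
  VectorParts &getScalar(Value *Key) {
    return get(Key, ScalarMapStorage, UF * VF);
  }

  // Scalarized instructions drop any stale widened entry first.
  bool eraseVector(Value *Key) { return VectorMapStorage.erase(Key); }

private:
  unsigned UF, VF;

  // std::map rather than DenseMap: references returned by get() must stay
  // valid across later insertions.
  std::map<Value *, VectorParts> VectorMapStorage;
  std::map<Value *, VectorParts> ScalarMapStorage;

  VectorParts &get(Value *Key, std::map<Value *, VectorParts> &Storage,
                   unsigned Size) {
    VectorParts &Entry = Storage[Key];
    if (Entry.empty())
      Entry.resize(Size);
    assert(Entry.size() == Size && "Entry has incorrect size");
    return Entry;
  }
};

A key can legitimately appear in both maps at once; getVectorValue builds the vector entry from the scalar one on demand, so the insertelement sequences are generated only the first time a vector form is requested.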
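The one invariant that buildScalarSteps, getScalarValue, and both scalarizeInstruction overloads now share is the flat addressing of a scalar map entry: the value for unroll part Part and vector lane Lane lives at index VF * Part + Lane. A small demonstration, with UF and VF chosen arbitrarily for illustration (not values taken from the patch):

#include <cstdio>

int main() {
  const unsigned UF = 2, VF = 4; // illustrative unroll and vector factors
  // Every (Part, Lane) pair addresses a unique slot in a UF * VF entry.
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf("Part %u, Lane %u -> Entry[%u]\n", Part, Lane,
                  VF * Part + Lane);
  return 0; // e.g. Part 1, Lane 2 lands at Entry[6]
}

When VF == 1 the same formula degenerates to Entry[Part], which is why the unrolled-only scalarizeInstruction can index the entry directly with Part.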