diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -776,6 +776,57 @@ Scalars[Mask[I]] = Prev[I]; } +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all operands are either not instructions +/// or phi nodes or instructions from different blocks. +static bool areAllOperandsNonInsts(Value *V) { + auto *I = dyn_cast(V); + if (!I) + return true; + return !I->mayReadOrWriteMemory() && all_of(I->operands(), [I](Value *V) { + auto *IO = dyn_cast(V); + if (!IO) + return true; + return isa(IO) || IO->getParent() != I->getParent(); + }); +} + +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all users are phi nodes or instructions +/// from the different blocks. +static bool isUsedOutsideBlock(Value *V) { + auto *I = dyn_cast(V); + if (!I) + return true; + // Limits the number of uses to save compile time. + constexpr int UsesLimit = 8; + return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) && + all_of(I->users(), [I](User *U) { + auto *IU = dyn_cast(U); + if (!IU) + return true; + return IU->getParent() != I->getParent() || isa(IU); + }); +} + +/// Checks if the specified value does not require scheduling. It does not +/// require scheduling if all operands and all users do not need to be scheduled +/// in the current basic block. +static bool doesNotNeedToBeScheduled(Value *V) { + return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V); +} + +/// Checks if the specified array of instructions does not require scheduling. +/// It is so if all either instructions have operands that do not require +/// scheduling or their users do not require scheduling since they are phis or +/// in other basic blocks. +static bool doesNotNeedToSchedule(ArrayRef VL) { + return !VL.empty() && + (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts)); +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -2359,15 +2410,21 @@ ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. - unsigned Lane = 0; - for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; - BundleMember = BundleMember->NextInBundle) { - BundleMember->TE = Last; - BundleMember->Lane = Lane; - ++Lane; - } - assert((!Bundle.getValue() || Lane == VL.size()) && + ScheduleData *BundleMember = Bundle.getValue(); + assert((BundleMember || isa(S.MainOp) || + isVectorLikeInstWithConstOps(S.MainOp) || + doesNotNeedToSchedule(VL)) && "Bundle and VL out of sync"); + if (BundleMember) { + for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; + assert(BundleMember && "Unexpected end of bundle."); + BundleMember->TE = Last; + BundleMember = BundleMember->NextInBundle; + } + } + assert(!BundleMember && "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -2504,7 +2561,6 @@ clearDependencies(); OpValue = OpVal; TE = nullptr; - Lane = -1; } /// Verify basic self consistency properties @@ -2544,7 +2600,7 @@ /// Returns true if it represents an instruction bundle and not only a /// single instruction. 
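// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a minimal, self-contained model
// of the predicate shape introduced above (areAllOperandsNonInsts /
// isUsedOutsideBlock / doesNotNeedToBeScheduled), written against a toy Inst
// record instead of llvm::Instruction. The rule it models: a value needs no
// ScheduleData if it does not touch memory and its instruction operands (and,
// symmetrically, its users) are all PHIs or live in a different basic block.
#include <algorithm>
#include <vector>

namespace sketch {

struct Inst {
  const void *Parent = nullptr;       // identity of the containing block
  bool MayReadOrWriteMemory = false;
  bool IsPHI = false;
  std::vector<const Inst *> Operands; // instruction operands only
  std::vector<const Inst *> Users;    // instruction users only
};

inline bool outsideOrPHI(const Inst &I, const Inst &Other) {
  return Other.IsPHI || Other.Parent != I.Parent;
}

inline bool allOperandsNonLocal(const Inst &I) {
  return !I.MayReadOrWriteMemory &&
         std::all_of(I.Operands.begin(), I.Operands.end(),
                     [&I](const Inst *Op) { return outsideOrPHI(I, *Op); });
}

inline bool allUsersNonLocal(const Inst &I) {
  return !I.MayReadOrWriteMemory &&
         std::all_of(I.Users.begin(), I.Users.end(),
                     [&I](const Inst *U) { return outsideOrPHI(I, *U); });
}

// Such a value can be emitted at the top of its block (operands non-local) or
// right before the terminator (users non-local) without being scheduled.
inline bool doesNotNeedSchedulingSketch(const Inst &I) {
  return allOperandsNonLocal(I) && allUsersNonLocal(I);
}

} // namespace sketch
// ---------------------------------------------------------------------------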
bool isPartOfBundle() const { - return NextInBundle != nullptr || FirstInBundle != this; + return NextInBundle != nullptr || FirstInBundle != this || TE; } /// Returns true if it is ready for scheduling, i.e. it has no more @@ -2649,9 +2705,6 @@ /// Note that this is negative as long as Dependencies is not calculated. int UnscheduledDeps = InvalidDeps; - /// The lane of this node in the TreeEntry. - int Lane = -1; - /// True if this instruction is scheduled (or considered as scheduled in the /// dry-run). bool IsScheduled = false; @@ -2669,6 +2722,21 @@ friend struct DOTGraphTraits; /// Contains all scheduling data for a basic block. + /// It does not schedules instructions, which are not memory read/write + /// instructions and their operands are either constants, or arguments, or + /// phis, or instructions from others blocks, or their users are phis or from + /// the other blocks. The resulting vector instructions can be placed at the + /// beginning of the basic block without scheduling (if operands does not need + /// to be scheduled) or at the end of the block (if users are outside of the + /// block). It allows to save some compile time and memory used by the + /// compiler. + /// ScheduleData is assigned for each instruction in between the boundaries of + /// the tree entry, even for those, which are not part of the graph. It is + /// required to correctly follow the dependencies between the instructions and + /// their correct scheduling. The ScheduleData is not allocated for the + /// instructions, which do not require scheduling, like phis, nodes with + /// extractelements/insertelements only or nodes with instructions, with + /// uses/operands outside of the block. struct BlockScheduling { BlockScheduling(BasicBlock *BB) : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} @@ -2696,7 +2764,7 @@ if (BB != I->getParent()) // Avoid lookup if can't possibly be in map. return nullptr; - ScheduleData *SD = ScheduleDataMap[I]; + ScheduleData *SD = ScheduleDataMap.lookup(I); if (SD && isInSchedulingRegion(SD)) return SD; return nullptr; @@ -2713,7 +2781,7 @@ return getScheduleData(V); auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; + ScheduleData *SD = I->second.lookup(Key); if (SD && isInSchedulingRegion(SD)) return SD; } @@ -2735,7 +2803,7 @@ BundleMember = BundleMember->NextInBundle) { if (BundleMember->Inst != BundleMember->OpValue) continue; - + // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. @@ -2760,7 +2828,9 @@ // reordered during buildTree(). We therefore need to get its operands // through the TreeEntry. if (TreeEntry *TE = BundleMember->TE) { - int Lane = BundleMember->Lane; + // Need to search for the lane since the tree entry can be reordered. + int Lane = std::distance(TE->Scalars.begin(), + find(TE->Scalars, BundleMember->Inst)); assert(Lane >= 0 && "Lane not set"); // Since vectorization tree is being built recursively this assertion @@ -2769,7 +2839,7 @@ // where their second (immediate) operand is not added. Since // immediates do not affect scheduler behavior this is considered // okay. 
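// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the lane-lookup idiom used
// above. With the per-node Lane field removed, the lane is recomputed by
// searching the tree entry's scalars for the bundle member's instruction,
// shown here on std::vector with opaque pointers instead of the real
// TreeEntry/ScheduleData types.
#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

inline int findLaneSketch(const std::vector<const void *> &Scalars,
                          const void *Inst) {
  auto It = std::find(Scalars.begin(), Scalars.end(), Inst);
  assert(It != Scalars.end() && "bundle member must be one of the scalars");
  return static_cast<int>(std::distance(Scalars.begin(), It));
}
// ---------------------------------------------------------------------------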
- auto *In = TE->getMainOp(); + auto *In = BundleMember->Inst; assert(In && (isa(In) || isa(In) || In->getNumOperands() == TE->getNumOperands()) && @@ -2814,7 +2884,8 @@ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { auto *SD = getScheduleData(I); - assert(SD && "primary scheduledata must exist in window"); + if (!SD) + continue; assert(isInSchedulingRegion(SD) && "primary schedule data not in window?"); assert(isInSchedulingRegion(SD->FirstInBundle) && @@ -3856,6 +3927,22 @@ return LoadsState::Gather; } +/// \return true if the specified list of values has only one instruction that +/// requires scheduling, false otherwise. +static bool needToScheduleSingleInstruction(ArrayRef VL) { + Value *NeedsScheduling = nullptr; + for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; + if (!NeedsScheduling) { + NeedsScheduling = V; + continue; + } + return false; + } + return NeedsScheduling; +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -6398,6 +6485,20 @@ return !E->isOpcodeOrAlt(I) || I->getParent() == BB; })); + // Set the insert point to the beginning of the basic block if the entry + // should not be scheduled. + if (E->State != TreeEntry::NeedToGather && + doesNotNeedToSchedule(E->Scalars)) { + BasicBlock::iterator InsertPt; + if (all_of(E->Scalars, isUsedOutsideBlock)) + InsertPt = BB->getTerminator()->getIterator(); + else + InsertPt = BB->getFirstInsertionPt(); + Builder.SetInsertPoint(BB, InsertPt); + Builder.SetCurrentDebugLocation(Front->getDebugLoc()); + return; + } + // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -6406,8 +6507,10 @@ // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); + Value *V = E->isOneOf(E->Scalars.back()); + if (doesNotNeedToBeScheduled(V)) + V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled); + auto *Bundle = BlocksSchedules[BB]->getScheduleData(V); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -7633,9 +7736,11 @@ BoUpSLP::ScheduleData * BoUpSLP::BlockScheduling::buildBundle(ArrayRef VL) { - ScheduleData *Bundle = nullptr; + ScheduleData *Bundle = nullptr; ScheduleData *PrevInBundle = nullptr; for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member " @@ -7663,7 +7768,8 @@ const InstructionsState &S) { // No need to schedule PHIs, insertelement, extractelement and extractvalue // instructions. - if (isa(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue)) + if (isa(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) || + doesNotNeedToSchedule(VL)) return nullptr; // Initialize the instruction bundle. @@ -7709,6 +7815,8 @@ // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; if (!extendSchedulingRegion(V, S)) { // If the scheduling region got new instructions at the lower end (or it // is a new region for the first bundle). 
This makes it necessary to @@ -7723,6 +7831,8 @@ bool ReSchedule = false; for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -7752,14 +7862,18 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, Value *OpValue) { - if (isa(OpValue) || isVectorLikeInstWithConstOps(OpValue)) + if (isa(OpValue) || isVectorLikeInstWithConstOps(OpValue) || + doesNotNeedToSchedule(VL)) return; + if (doesNotNeedToBeScheduled(OpValue)) + OpValue = *find_if_not(VL, doesNotNeedToBeScheduled); ScheduleData *Bundle = getScheduleData(OpValue); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); - assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && + assert(Bundle->isSchedulingEntity() && + (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && "tried to unbundle something which is not a bundle"); // Remove the bundle from the ready list. @@ -7773,6 +7887,7 @@ BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; + BundleMember->TE = nullptr; if (BundleMember->unscheduledDepsInBundle() == 0) { ReadyInsts.insert(BundleMember); } @@ -7796,6 +7911,7 @@ Instruction *I = dyn_cast(V); assert(I && "bundle member must be an instruction"); assert(!isa(I) && !isVectorLikeInstWithConstOps(I) && + !doesNotNeedToBeScheduled(I) && "phi nodes/insertelements/extractelements/extractvalues don't need to " "be scheduled"); auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool { @@ -7872,7 +7988,10 @@ ScheduleData *NextLoadStore) { ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { - ScheduleData *SD = ScheduleDataMap[I]; + // No need to allocate data for non-schedulable instructions. 
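// ---------------------------------------------------------------------------
// Illustrative side note (not part of the patch): the switch from
// ScheduleDataMap[I] to ScheduleDataMap.lookup(I) in this patch's query paths
// avoids materializing a default (null) entry when the key is absent. The
// pitfall is demonstrated below on std::map; llvm::DenseMap::lookup is the
// analogous non-inserting query, returning the mapped value or a
// value-initialized default.
#include <cassert>
#include <map>

inline void lookupVsSubscriptSketch() {
  std::map<int, int *> M;

  // operator[] inserts a value-initialized (null) mapping as a side effect.
  int *A = M[42];
  assert(A == nullptr && M.size() == 1);

  // A find-based query does not grow the map when the key is missing.
  auto It = M.find(7);
  int *B = (It == M.end()) ? nullptr : It->second;
  assert(B == nullptr && M.size() == 1);
}
// ---------------------------------------------------------------------------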
+ if (doesNotNeedToBeScheduled(I)) + continue; + ScheduleData *SD = ScheduleDataMap.lookup(I); if (!SD) { SD = allocateScheduleDataChunks(); ScheduleDataMap[I] = SD; @@ -8056,8 +8175,10 @@ for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + TreeEntry *SDTE = getTreeEntry(SD->Inst); assert((isVectorLikeInstWithConstOps(SD->Inst) || - SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) && + SD->isPartOfBundle() == + (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll @@ -36,6 +36,7 @@ ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -85,7 +86,6 @@ ; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -111,6 +111,7 @@ ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -160,7 +161,6 @@ ; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -297,6 +297,7 @@ ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: 
[[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -346,7 +347,6 @@ ; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -372,6 +372,7 @@ ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -421,7 +422,6 @@ ; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -35,41 +35,14 @@ ; ; MAX-COST-LABEL: @PR28330( ; MAX-COST-NEXT: entry: -; MAX-COST-NEXT: [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 -; MAX-COST-NEXT: [[P1:%.*]] = icmp eq i8 [[P0]], 0 -; MAX-COST-NEXT: [[P2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 -; MAX-COST-NEXT: [[P3:%.*]] = icmp eq i8 [[P2]], 0 -; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 -; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 -; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 -; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 -; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 -; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0 -; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 
7), align 1 -; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 -; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 -; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0 +; MAX-COST-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 +; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: -; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[P19:%.*]] = select i1 [[P1]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P20:%.*]] = add i32 [[P17]], [[P19]] -; MAX-COST-NEXT: [[P21:%.*]] = select i1 [[P3]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P22:%.*]] = add i32 [[P20]], [[P21]] -; MAX-COST-NEXT: [[P23:%.*]] = select i1 [[P5]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P24:%.*]] = add i32 [[P22]], [[P23]] -; MAX-COST-NEXT: [[P25:%.*]] = select i1 [[P7]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P26:%.*]] = add i32 [[P24]], [[P25]] -; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P28:%.*]] = add i32 [[P26]], [[P27]] -; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P30:%.*]] = add i32 [[P28]], [[P29]] -; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[P30]], [[P31]] -; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P34]] = add i32 [[P32]], [[P33]] +; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> +; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]] ; MAX-COST-NEXT: br label [[FOR_BODY]] ; entry: @@ -139,30 +112,14 @@ ; ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: -; MAX-COST-NEXT: [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <4 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer -; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 -; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 -; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 -; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0 -; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 -; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 -; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 -; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0 +; MAX-COST-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 +; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: -; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> , <4 x i32> -; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 -; 
MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) -; MAX-COST-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[P27]] -; MAX-COST-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP5]], -5 -; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] -; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P34]] = add i32 [[P32]], [[P33]] +; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> +; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5 ; MAX-COST-NEXT: br label [[FOR_BODY]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll @@ -14,14 +14,14 @@ ; CHECK-NEXT: call void @llvm.dbg.value(metadata %struct.S* [[P:%.*]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG25:![0-9]+]] ; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P]], i64 [[N]], i32 0, !dbg [[DBG26:![0-9]+]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG27:![0-9]+]] -; CHECK-NEXT: [[Y3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[N]], i32 1, !dbg [[DBG28:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG28:![0-9]+]] +; CHECK-NEXT: [[Y3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[N]], i32 1, !dbg [[DBG29:![0-9]+]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[X1]] to <2 x i64>*, !dbg [[DBG26]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA29:![0-9]+]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA30:![0-9]+]] ; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 0, !dbg [[DBG34:![0-9]+]] ; CHECK-NEXT: [[Y7:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 1, !dbg [[DBG35:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[X5]] to <2 x i64>*, !dbg [[DBG36:![0-9]+]] -; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA29]] +; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA30]] ; CHECK-NEXT: ret void, !dbg [[DBG37:![0-9]+]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll @@ -11,14 +11,14 @@ ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[BB:%.*]] ], [ [[TMP11:%.*]], [[BB25:%.*]] ] ; CHECK-NEXT: br i1 undef, label [[BB3:%.*]], label [[BB11:%.*]] 
; CHECK: bb3: -; CHECK-NEXT: [[I4:%.*]] = zext i1 undef to i32 ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], undef +; CHECK-NEXT: [[I4:%.*]] = zext i1 undef to i32 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i16> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: br label [[BB25]] ; CHECK: bb11: -; CHECK-NEXT: [[I12:%.*]] = zext i1 undef to i32 ; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i16> [[TMP0]], undef +; CHECK-NEXT: [[I12:%.*]] = zext i1 undef to i32 ; CHECK-NEXT: [[TMP5:%.*]] = sext <2 x i16> [[TMP4]] to <2 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <2 x i64> undef, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i1> [[TMP6]] to <2 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll @@ -9,11 +9,11 @@ ; CHECK: loop: ; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA1:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3]] = extractelement <4 x i64> [[TMP2]], i32 3 +; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP4]], 32 ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> , [[TMP2]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -10,10 +10,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 -; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 +; CHECK-NEXT: store atomic i32 [[TMP4]], i32* [[VALS:%.*]] unordered, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[V44]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll @@ -7,12 +7,12 @@ define i32 @foo(i32* nocapture %A, i32 %n) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@bar() ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @bar() ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll @@ -511,9 +511,9 @@ ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM2]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[ADD5]] = add i32 [[I_018]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]] +; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP3]], [[TMP5]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -80,8 +80,8 @@ ; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], ; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> ; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], -; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 +; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] ; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AVX: for.end: ; AVX-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll @@ -8,16 +8,17 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 -; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP8]], undef ; CHECK-NEXT: [[IXX0:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX1:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX2:%.*]] = fsub double undef, undef 
; CHECK-NEXT: [[IXX3:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX4:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX5:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef +; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP8]], undef ; CHECK-NEXT: [[IXX10:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX11:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX12:%.*]] = fsub double undef, undef @@ -27,15 +28,14 @@ ; CHECK-NEXT: [[IXX20:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX21:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 -; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP9]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP10]] ; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP6]] ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP14]], undef ; CHECK-NEXT: switch i32 undef, label [[BB1:%.*]] [ ; CHECK-NEXT: i32 0, label [[BB2:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll b/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll @@ -22,8 +22,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <2 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[G:%.*]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@bar() diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll b/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll @@ -24,9 +24,9 @@ ; CHECK: for.body: ; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ [[TMP4:%.*]], [[FOR_BODY]] ], [ [[TMP1]], [[ENTRY]] ] -; CHECK-NEXT: [[TMP4]] = mul nsw <4 x i32> [[TMP3]], ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_029]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[TMP2]] +; CHECK-NEXT: [[TMP4]] = mul nsw <4 x i32> [[TMP3]], ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x i32> [ [[TMP1]], [[ENTRY]] ], [ [[TMP4]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll @@ -34,9 +34,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_020]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100 +; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -25,22 +25,22 @@ ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint i32* [[TMP5]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1 -; CHECK-NEXT: store i32 [[TMP7]], i32* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1 +; CHECK-NEXT: store i32 [[TMP7]], i32* [[TMP8]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: ; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 +; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: ; CHECK-NEXT: [[C_022_BE]] = phi i32* [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ 
[[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP13]], [[SW_BB6]] ], [ [[TMP9]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll @@ -21,14 +21,14 @@ ; CHECK-LABEL: @bar( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float> -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[D:%.*]], 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[TMP7:%.*]], label [[TMP5:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[D:%.*]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float> +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP7:%.*]], label [[TMP5:%.*]] ; CHECK: 5: ; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 (...) @foo() ; CHECK-NEXT: br label [[TMP7]] ; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP3]], +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP4]], ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[A]], i64 8 ; CHECK-NEXT: [[TMP10:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double> ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll @@ -58,10 +58,10 @@ define void @test2(i64* %a, i64* %b) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> +; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -83,9 +83,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_019]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100 +; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* @@ -151,7 +151,7 @@ ; CHECK: for.body: ; CHECK-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00 @@ -169,18 +169,18 @@ ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fmul <4 x float> [[TMP17]], -; CHECK-NEXT: [[TMP19]] = fadd <4 x float> [[TMP6]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121 +; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121 +; CHECK-NEXT: [[TMP20]] = fadd <4 x float> [[TMP6]], [[TMP18]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP20]], i32 0 ; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP19]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP20]], i32 1 ; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP20]], i32 2 ; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP19]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP20]], i32 3 ; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP24]] ; CHECK-NEXT: ret float [[ADD31]] ; @@ -245,6 +245,8 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ , [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 @@ -254,8 +256,6 @@ ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 3 ; CHECK-NEXT: [[TMP9]] = fmul <4 x float> [[TMP8]], -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -299,17 +299,17 @@ ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 1 +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 +; AVX512VL-NEXT: store i32 [[TMP6]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], ; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* ; AVX512VL-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 ; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] @@ -510,10 +510,10 @@ ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 ; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer ; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 ; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 @@ -749,47 +749,47 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> -; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> 
[[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64>
-; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32>
-; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32>
-; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32>
-; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32>
-; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512F-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
-; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
+; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0
+; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer
+; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64>
+; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
+; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64>
+; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
+; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32>
+; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32>
+; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32>
+; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32>
+; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
+; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
; AVX512F-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
; AVX512F-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_div(
-; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64>
-; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64>
-; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32>
-; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32>
-; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32>
-; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32>
-; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512VL-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
-; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
+; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0
+; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64>
+; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
+; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64>
+; AVX512VL-NEXT: [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
+; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32>
+; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32>
+; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32>
+; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32>
+; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
+; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
; AVX512VL-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
; AVX512VL-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -299,17 +299,17 @@
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_3(
-; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0
-; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64>
-; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0
+; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64>
+; AVX512VL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 1
+; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX512VL-NEXT: store i32 [[TMP6]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]]
; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]],
; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
; AVX512VL-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
@@ -510,10 +510,10 @@
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_4(
-; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0
; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64>
+; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
@@ -749,47 +749,47 @@
; AVX2-NEXT: ret void
;
; AVX512F-LABEL: @gather_load_div(
-; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64>
-; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
-; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64>
-; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32>
-; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32>
-; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32>
-; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32>
-; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512F-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
-; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
+; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0
+; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer
+; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64>
+; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
+; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64>
+; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
+; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32>
+; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32>
+; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32>
+; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32>
+; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
+; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
; AVX512F-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
; AVX512F-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_div(
-; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64>
-; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64>
-; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32>
-; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32>
-; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32>
-; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32>
-; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512VL-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
-; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
+; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0
+; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64>
+; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
+; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64>
+; AVX512VL-NEXT: [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
+; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32>
+; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32>
+; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32>
+; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32>
+; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
+; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
; AVX512VL-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
; AVX512VL-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
@@ -6,12 +6,12 @@
define <4 x i32> @foo(<4 x i32> %x, i32 %f) {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i64 0
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1
-; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]],
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F]], i64 0
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1
+; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32>
; CHECK-NEXT: [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[VECINIT51]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
@@ -33,30 +33,30 @@
; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ]
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX7]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP7]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP9]],
-; CHECK-NEXT: [[TMP11]] = fadd <2 x float> [[TMP5]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP13]], 9.000000e+00
+; CHECK-NEXT: [[TMP11:%.*]] = add nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP12]], 9.000000e+00
; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121
+; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP13]], 121
+; CHECK-NEXT: [[TMP14]] = fadd <2 x float> [[TMP5]], [[TMP10]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]]
; CHECK: for.body.for.body_crit_edge:
; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4
; CHECK-NEXT: br label [[FOR_BODY]]
; CHECK: for.end:
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[TMP15]], [[TMP16]]
; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]]
; CHECK-NEXT: ret float [[ADD17]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll
@@ -21,11 +21,11 @@
; CHECK-NEXT: i32 2, label [[SW_BB]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G]] to <2 x i32>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
; CHECK-NEXT: [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i32> [[SHRINK_SHUFFLE]],
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[SHRINK_SHUFFLE]],
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[G]] to <2 x i32>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP2]]
; CHECK-NEXT: br label [[SW_EPILOG]]
; CHECK: sw.epilog:
; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP5]], [[SW_BB]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
@@ -1065,14 +1065,14 @@
define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
; SSE-LABEL: @sitofp_4xi32_4f64(
-; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
-; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A3:%.*]], i32 1
; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double>
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0
-; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A3:%.*]], i32 1
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A1:%.*]], i32 1
; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double>
-; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32>
-; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32>
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32>
+; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32>
; SSE-NEXT: [[RES31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32>
; SSE-NEXT: ret <4 x double> [[RES31]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -1065,14 +1065,14 @@
define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
; SSE-LABEL: @sitofp_4xi32_4f64(
-; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
-; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A3:%.*]], i32 1
; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double>
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0
-; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A3:%.*]], i32 1
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A1:%.*]], i32 1
; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double>
-; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32>
-; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32>
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32>
+; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32>
; SSE-NEXT: [[RES31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32>
; SSE-NEXT: ret <4 x double> [[RES31]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
@@ -19,15 +19,15 @@
; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_7]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_4]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_8]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_4]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[LOAD_8]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_3]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[LOAD_7]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEXT: br label [[BLOCK1:%.*]]
; CHECK: block1:
@@ -42,9 +42,9 @@
; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 4
; CHECK-NEXT: store i32 [[LOAD_9]], i32* [[GEP_9]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[GEP_10]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP11]], align 4
+; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP11]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP12]], align 4
+; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP12]], align 4
; CHECK-NEXT: ret i32 undef
;
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
@@ -16,8 +16,8 @@
; CHECK-NEXT: [[TMP3:%.*]] = load double, double* undef, align 8
; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]]
; CHECK: bb4:
-; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double
; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double>
+; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP3]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[CONV2]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
--- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
@@ -133,27 +133,27 @@
; MAX256-NEXT: br label [[BB1:%.*]]
; MAX256: bb1:
; MAX256-NEXT: [[I:%.*]] = fpext half [[HVAL:%.*]] to float
-; MAX256-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float
-; MAX256-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float
-; MAX256-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float
; MAX256-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0
; MAX256-NEXT: [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer
; MAX256-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0
; MAX256-NEXT: [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer
; MAX256-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]]
-; MAX256-NEXT: [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]]
-; MAX256-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
-; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX256-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]]
-; MAX256-NEXT: [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]]
-; MAX256-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
-; MAX256-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX256-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]]
-; MAX256-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]]
-; MAX256-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
-; MAX256-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX256-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]]
-; MAX256-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]]
+; MAX256-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT: [[TMP3:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
+; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX256-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]]
+; MAX256-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT: [[TMP5:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
+; MAX256-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX256-NEXT: [[TMP6:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]]
+; MAX256-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
+; MAX256-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX256-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]]
+; MAX256-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP4]]
+; MAX256-NEXT: [[TMP10:%.*]] = fadd <8 x float> zeroinitializer, [[TMP6]]
+; MAX256-NEXT: [[TMP11:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]]
+; MAX256-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]]
; MAX256-NEXT: switch i32 undef, label [[BB5:%.*]] [
; MAX256-NEXT: i32 0, label [[BB2:%.*]]
; MAX256-NEXT: i32 1, label [[BB3:%.*]]
@@ -166,10 +166,10 @@
; MAX256: bb5:
; MAX256-NEXT: br label [[BB2]]
; MAX256: bb2:
-; MAX256-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
-; MAX256-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ]
-; MAX256-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ]
-; MAX256-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
+; MAX256-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
+; MAX256-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP10]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP10]], [[BB5]] ], [ [[TMP10]], [[BB1]] ]
+; MAX256-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP11]], [[BB3]] ], [ [[TMP11]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP11]], [[BB1]] ]
+; MAX256-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[TMP12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
; MAX256-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7
; MAX256-NEXT: store float [[TMP17]], float* undef, align 4
; MAX256-NEXT: ret void
@@ -179,27 +179,27 @@
; MAX1024-NEXT: br label [[BB1:%.*]]
; MAX1024: bb1:
; MAX1024-NEXT: [[I:%.*]] = fpext half [[HVAL:%.*]] to float
-; MAX1024-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float
-; MAX1024-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float
-; MAX1024-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float
; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0
; MAX1024-NEXT: [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer
; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0
; MAX1024-NEXT: [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer
; MAX1024-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]]
-; MAX1024-NEXT: [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]]
-; MAX1024-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
-; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX1024-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]]
-; MAX1024-NEXT: [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]]
-; MAX1024-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
-; MAX1024-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX1024-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]]
-; MAX1024-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]]
-; MAX1024-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
-; MAX1024-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX1024-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]]
-; MAX1024-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]]
+; MAX1024-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT: [[TMP3:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
+; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX1024-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]]
+; MAX1024-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT: [[TMP5:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
+; MAX1024-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX1024-NEXT: [[TMP6:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]]
+; MAX1024-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
+; MAX1024-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX1024-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]]
+; MAX1024-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP4]]
+; MAX1024-NEXT: [[TMP10:%.*]] = fadd <8 x float> zeroinitializer, [[TMP6]]
+; MAX1024-NEXT: [[TMP11:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]]
+; MAX1024-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]]
; MAX1024-NEXT: switch i32 undef, label [[BB5:%.*]] [
; MAX1024-NEXT: i32 0, label [[BB2:%.*]]
; MAX1024-NEXT: i32 1, label [[BB3:%.*]]
@@ -212,10 +212,10 @@
; MAX1024: bb5:
; MAX1024-NEXT: br label [[BB2]]
; MAX1024: bb2:
-; MAX1024-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
-; MAX1024-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ]
-; MAX1024-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ]
-; MAX1024-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
+; MAX1024-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
+; MAX1024-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP10]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP10]], [[BB5]] ], [ [[TMP10]], [[BB1]] ]
+; MAX1024-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP11]], [[BB3]] ], [ [[TMP11]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP11]], [[BB1]] ]
+; MAX1024-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[TMP12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
; MAX1024-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7
; MAX1024-NEXT: store float [[TMP17]], float* undef, align 4
; MAX1024-NEXT: ret void