Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -516,11 +516,14 @@
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  int getSpillCost();
+  int getSpillCost(const SmallPtrSetImpl &ScalarsToVec);
+
+  /// \returns the cost of extracting vectorized elements.
+  int getExtractCost(const SmallPtrSetImpl &ScalarsToVec);
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  int getTreeCost();
+  int getTreeCost(bool ReduceTree = false);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -541,6 +544,7 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -600,6 +604,9 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable();
 
+  /// Reduce the cost of the tree to make it partially vectorizable, if possible.
+  bool reduceTree();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
 private:
@@ -700,6 +707,12 @@
     /// The TreeEntry index containing the user of this entry. We can actually
     /// have multiple users so the data structure is not truly a tree.
     SmallVector UserTreeIndices;
+
+    /// Cost of the tree entry.
+    int Cost = 0;
+
+    /// Full cost of expanding the tree at this height.
+    int ExpandAtCost = 0;
   };
 
   /// Create a new VectorizableTree entry.
@@ -742,6 +755,9 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap ScalarToTreeEntry;
 
+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector RemovedOperations;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
@@ -1170,6 +1186,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector> BlocksSchedules;
 
+  /// Remove operations from the list of operations proposed for scheduling.
+  void removeFromScheduling(BlockScheduling *BS, ArrayRef Operations);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -2413,6 +2432,55 @@
   }
 }
 
+bool BoUpSLP::reduceTree() {
+  bool Reduced = false;
+
+  // Estimating where to stop vectorization.
+  unsigned StopAt = 0;
+  int MinCost = VectorizableTree.back().ExpandAtCost;
+  for (unsigned I = VectorizableTree.size(); I--;) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    if (VectorizableTree[I].ExpandAtCost < MinCost) {
+      MinCost = VectorizableTree[I].ExpandAtCost;
+      Reduced = true;
+      StopAt = I;
+    }
+  }
+
+  // Canceling unprofitable elements.
+  if (StopAt > 0 && StopAt < (VectorizableTree.size() - 1)) {
+    int ReducedBy = MinCost - VectorizableTree.back().ExpandAtCost;
+    LLVM_DEBUG(dbgs() << "SLP: Reduced the tree cost by " << ReducedBy
+                      << " to make it partially vectorizable.\n");
+    Reduced = true;
+    for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) {
+      TreeEntry *Entry = &VectorizableTree[I];
+      if (!Entry->NeedToGather) {
+        if (I >= StopAt) {
+          Entry->NeedToGather = true;
+          for (Value *V : Entry->Scalars) {
+            LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                              << " out of proposed to vectorize.\n");
+            RemovedOperations.push_back(I);
+            ScalarToTreeEntry.erase(V);
+            MustGather.insert(V);
+            ExternalUses.erase(std::remove_if(ExternalUses.begin(),
+                                              ExternalUses.end(),
+                                              [&](ExternalUser &EU) {
+                                                return EU.Scalar == V;
+                                              }),
+                               ExternalUses.end());
+          }
+        }
+      }
+    }
+  }
+
+  return Reduced;
+}
+
 bool BoUpSLP::isFullyVectorizableTinyTree() {
   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                     << VectorizableTree.size() << " is fully vectorizable .\n");
@@ -2457,7 +2525,7 @@
   return true;
 }
 
-int BoUpSLP::getSpillCost() {
+int BoUpSLP::getSpillCost(const SmallPtrSetImpl &ScalarsToVec) {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
@@ -2481,7 +2549,7 @@
     // Update LiveValues.
     LiveValues.erase(PrevInst);
     for (auto &J : PrevInst->operands()) {
-      if (isa(&*J) && getTreeEntry(&*J))
+      if (isa(&*J) && ScalarsToVec.count(&*J) > 0)
         LiveValues.insert(cast(&*J));
     }
 
@@ -2512,23 +2580,51 @@
       V.push_back(VectorType::get(II->getType(), BundleWidth));
       Cost += TTI->getCostOfKeepingLiveOverCall(V);
     }
-
       ++PrevInstIt;
     }
-
     PrevInst = Inst;
   }
 
   return Cost;
 }
 
-int BoUpSLP::getTreeCost() {
-  int Cost = 0;
-  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
-                    << VectorizableTree.size() << ".\n");
+int BoUpSLP::getExtractCost(const SmallPtrSetImpl &ScalarsToVec) {
+  int ExtractCost = 0;
+  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
+  SmallPtrSet ExtractCostCalculated;
+  for (ExternalUser &EU : ExternalUses) {
-    unsigned BundleWidth = VectorizableTree[0].Scalars.size();
+    // Avoid non-vectorized scalars for this tree height.
+    if (ScalarsToVec.count(EU.Scalar) == 0)
+      continue;
+    // We only add extract cost once for the same scalar.
+    if (!ExtractCostCalculated.insert(EU.Scalar).second)
+      continue;
+
+    // If we plan to rewrite the tree in a smaller type, we will need to sign
+    // extend the extracted value back to the original type. Here, we account
+    // for the extract and the added cost of the sign extend if needed.
+    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+    if (MinBWs.count(ScalarRoot)) {
+      auto *MinTy = IntegerType::get(F->getContext(),
+                                     MinBWs[ScalarRoot].first);
+      auto Extend =
+          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
+      VecTy = VectorType::get(MinTy, BundleWidth);
+      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+                                                   VecTy, EU.Lane);
+    } else {
+      ExtractCost +=
+          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+    }
+  }
+  return ExtractCost;
+}
+
+int BoUpSLP::getTreeCost(bool ReduceTree) {
   for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
     TreeEntry &TE = VectorizableTree[I];
@@ -2551,60 +2647,60 @@
         }))
       continue;
 
-    int C = getEntryCost(&TE);
-    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+    TE.Cost = getEntryCost(&TE);
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost
                       << " for bundle that starts with " << *TE.Scalars[0]
                       << ".\n");
-    Cost += C;
   }
 
-  SmallPtrSet ExtractCostCalculated;
+  SmallPtrSet ScalarsToVec;
+  int CostSum = 0;
   int ExtractCost = 0;
-  for (ExternalUser &EU : ExternalUses) {
-    // We only add extract cost once for the same scalar.
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
+  int SpillCost = 0;
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) {
+    TreeEntry *Entry = &VectorizableTree[I];
 
-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
-      continue;
+    CostSum += Entry->Cost;
 
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
-    if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-      auto Extend =
-          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = VectorType::get(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+    Entry->ExpandAtCost = 0;
+    // If we have not encountered any gather node before, add gather
+    // cost for that operation.
+    if (!Entry->NeedToGather) {
+      if (I != (VectorizableTree.size() - 1)) {
+        if (!VectorizableTree[I + 1].NeedToGather)
+          Entry->ExpandAtCost = getGatherCost(Entry->Scalars);
+      }
+      for (Value *V : Entry->Scalars)
+        ScalarsToVec.insert(V);
     }
-  }
+    Entry->ExpandAtCost += CostSum;
+
+    ExtractCost = getExtractCost(ScalarsToVec);
+    Entry->ExpandAtCost += ExtractCost;
 
-  int SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
+    SpillCost = getSpillCost(ScalarsToVec);
+    Entry->ExpandAtCost += SpillCost;
+  }
+
+  int MinCost = VectorizableTree.back().ExpandAtCost;
+  if (ReduceTree) {
+    for (unsigned I = VectorizableTree.size(); I--;) {
+      TreeEntry *Entry = &VectorizableTree[I];
+      if (Entry->NeedToGather)
+        continue;
+      if (VectorizableTree[I].ExpandAtCost < MinCost)
+        MinCost = VectorizableTree[I].ExpandAtCost;
+    }
+  }
 
   std::string Str;
   {
     raw_string_ostream OS(Str);
-    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
-       << "SLP: Extract Cost = " << ExtractCost << ".\n"
-       << "SLP: Total Cost = " << Cost << ".\n";
+    OS << "SLP: Total Cost = " << MinCost << ".\n";
   }
   LLVM_DEBUG(dbgs() << Str);
-
   if (ViewSLPTree)
     ViewGraph(this, "SLP" + F->getName(), false, Str);
-
-  return Cost;
+  return MinCost;
 }
 
 int BoUpSLP::getGatherCost(Type *Ty,
@@ -3574,7 +3670,12 @@
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove all ScheduleData from the nodes for which we have changed the
+    // vectorization decision.
+    if (!RemovedOperations.empty())
+      removeFromScheduling(BS, RemovedOperations);
+    scheduleBlock(BS);
   }
 
   Builder.SetInsertPoint(&F->getEntryBlock().front());
@@ -3702,15 +3803,8 @@
       Type *Ty = Scalar->getType();
       if (!Ty->isVoidTy()) {
-#ifndef NDEBUG
-        for (User *U : Scalar->users()) {
-          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
-
-          // It is legal to replace users in the ignorelist by undef.
-          assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
-                 "Replacing out-of-tree value with undef");
-        }
-#endif
+        // The tree might not be fully vectorized, so we don't have to
+        // check every user.
         Value *Undef = UndefValue::get(Ty);
         Scalar->replaceAllUsesWith(Undef);
       }
@@ -4186,6 +4280,33 @@
   ReadyInsts.clear();
 }
 
+void BoUpSLP::removeFromScheduling(BlockScheduling *BS, ArrayRef Operations) {
+  bool Removed = false;
+  for (int I : Operations) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]);
+    if (SD && SD->isPartOfBundle()) {
+      if (!Removed) {
+        Removed = true;
+        BS->resetSchedule();
+      }
+      BS->cancelScheduling(Entry->Scalars, SD->OpValue);
+    }
+  }
+  if (Removed) {
+    BS->resetSchedule();
+    BS->initialFillReadyList(BS->ReadyInsts);
+    for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+         I = I->getNextNode()) {
+      if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end())
+        continue;
+      BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
+        SD->clearDependencies();
+      });
+    }
+  }
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -4682,7 +4803,16 @@
   const unsigned ChainLen = Chain.size();
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                     << "\n");
-  const unsigned Sz = R.getVectorElementSize(Chain[0]);
+  Value *FirstStore = nullptr;
+  for (Value *V : Chain) {
+    assert(isa(V) && "Expected only StoreInst here!");
+    if (StoreInst *SI = cast(V))
+      if (SI->getValueOperand())
+        FirstStore = V;
+  }
+  if (!FirstStore)
+    return false;
+  const unsigned Sz = R.getVectorElementSize(FirstStore);
   const unsigned VF = VecRegSize / Sz;
 
   if (!isPowerOf2_32(Sz) || VF < 2)
@@ -4703,13 +4833,20 @@
                       << "\n");
     ArrayRef Operands = Chain.slice(i, VF);
 
+    // Skip if any store instruction was already vectorized.
+    if (std::any_of(Operands.begin(), Operands.end(),
+                    [](Value *V) {
+                      return (!(cast(V))->getValueOperand());
+                    }))
+      continue;
+
     R.buildTree(Operands);
     if (R.isTreeTinyAndNotFullyVectorizable())
      continue;
 
     R.computeMinimumValueSizes();
-    int Cost = R.getTreeCost();
+    int Cost = R.getTreeCost(true);
 
     LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
@@ -4728,6 +4865,7 @@
       // Move to the next bundle.
i += VF - 1; + R.reduceTree(); Changed = true; } } Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -39,19 +39,27 @@ ; CHECK-LABEL: @store_chain_v2i64( ; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1 ; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1 -; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1 ; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8 ; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8 ; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8 ; CHECK-NEXT: [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8 -; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[V0_0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[V1_0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[V0_1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[V1_1]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = sub <2 x i64> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i64> [[TMP7]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 8 ; CHECK-NEXT: ret void ; %a.0 = getelementptr i64, i64* %a, i64 0 Index: test/Transforms/SLPVectorizer/X86/addsub.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/addsub.ll +++ test/Transforms/SLPVectorizer/X86/addsub.ll @@ -348,22 +348,28 @@ define void @no_vec_shuff_reorder() #0 { ; CHECK-LABEL: @no_vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* 
getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]] -; CHECK-NEXT: store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]] -; CHECK-NEXT: store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]] -; CHECK-NEXT: store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fa to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fb to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> undef, float [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> undef, float [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = fsub <4 x float> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP21]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4 ; CHECK-NEXT: ret void ; %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 Index: test/Transforms/SLPVectorizer/X86/bad_types.ll 
=================================================================== --- test/Transforms/SLPVectorizer/X86/bad_types.ll +++ test/Transforms/SLPVectorizer/X86/bad_types.ll @@ -12,11 +12,12 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64 ; CHECK-NEXT: [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64 -; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42 -; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> undef, i64 [[A_CAST]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[B_CAST]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> , [[TMP1]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, i64* [[PTR:%.*]], i32 1 -; CHECK-NEXT: store i64 [[A_AND]], i64* [[PTR]] -; CHECK-NEXT: store i64 [[B_AND]], i64* [[GEP]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[PTR]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/crash_dequeue.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_dequeue.ll +++ test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -11,23 +11,25 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[_M_CUR2_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST]], i64 0, i32 1 ; CHECK-NEXT: [[_M_CUR2_I_I81:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[_M_CUR2_I_I81]], align 8 ; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST]], i64 0, i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = load double*, double** [[_M_FIRST3_I_I83]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double*> undef, double* [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double*> [[TMP3]], double* [[TMP2]], i32 1 ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]] ; CHECK: while.cond.i.preheader: +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double*> undef, double* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double*> [[TMP5]], double* [[TMP2]], i32 1 ; CHECK-NEXT: br label [[WHILE_COND_I:%.*]] ; CHECK: while.cond.i: ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_BODY_I:%.*]] ; CHECK: while.body.i: ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]] ; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: -; CHECK-NEXT: [[TMP3:%.*]] = phi double* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double* [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, 
[[WHILE_BODY_I]] ] -; CHECK-NEXT: store double* [[TMP4]], double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: store double* [[TMP3]], double** [[_M_FIRST3_I_I]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x double*> [ [[TMP4]], [[ENTRY:%.*]] ], [ [[TMP6]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double** [[_M_CUR2_I_I]] to <2 x double*>* +; CHECK-NEXT: store <2 x double*> [[TMP7]], <2 x double*>* [[TMP8]], align 8 ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] ; CHECK: if.then.i55: ; CHECK-NEXT: br label [[WHILE_COND]] Index: test/Transforms/SLPVectorizer/X86/crash_gep.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_gep.ll +++ test/Transforms/SLPVectorizer/X86/crash_gep.ll @@ -11,12 +11,13 @@ ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint i64* [[ADD_PTR]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 -; CHECK-NEXT: store i64 [[TMP1]], i64* [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i64* [[ARRAYIDX]] to i64 -; CHECK-NEXT: store i64 [[TMP2]], i64* [[ADD_PTR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64*> undef, i64* [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/crash_lencod.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_lencod.ll +++ test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -126,14 +126,16 @@ define fastcc void @dct36(double* %inbuf) { ; CHECK-LABEL: @dct36( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 2 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF]], i64 1 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX44]], align 8 -; CHECK-NEXT: [[ADD46:%.*]] = fadd double [[TMP0]], undef -; CHECK-NEXT: store double [[ADD46]], double* [[ARRAYIDX41]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[INBUF]], align 8 -; CHECK-NEXT: [[ADD49:%.*]] = fadd double [[TMP1]], [[TMP0]] -; CHECK-NEXT: store double [[ADD49]], double* [[ARRAYIDX44]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double undef, i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: ret void 
; entry: Index: test/Transforms/SLPVectorizer/X86/cse.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cse.ll +++ test/Transforms/SLPVectorizer/X86/cse.ll @@ -19,19 +19,20 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> , [[TMP2]] ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 -; CHECK-NEXT: [[ADD8:%.*]] = fadd double [[TMP5]], 7.000000e+00 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[ADD12:%.*]] = fadd double [[MUL11]], 8.000000e+00 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[MUL11]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> , [[TMP9]] ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 -; CHECK-NEXT: store double [[ADD12]], double* [[ARRAYIDX13]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[G]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP10]], <4 x double>* [[TMP11]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal.ll +++ test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -730,54 +730,54 @@ ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 +; CHECK-NEXT: 
[[TMP1:%.*]] = or i64 [[TMP0]], 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 2 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP0]], 3 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 ; CHECK-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]] ; CHECK: for.body16.lr.ph: ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ADD_PTR]], align 4 ; CHECK-NEXT: br label [[FOR_BODY16:%.*]] ; CHECK: for.cond.cleanup15: -; CHECK-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ] -; CHECK-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4 -; CHECK-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4 -; CHECK-NEXT: store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY]] ], [ [[TMP19:%.*]], [[FOR_BODY16]] ] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6 ; CHECK-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; CHECK: for.body16: -; CHECK-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ] ; CHECK-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000 -; CHECK-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000 +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP19]], [[FOR_BODY16]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 +; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP10]], 0x3FF19999A0000000 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1 +; CHECK-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[TMP11]], 0xBFF3333340000000 ; CHECK-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]] -; CHECK-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]] +; CHECK-NEXT: [[SUB19:%.*]] = fadd fast float [[SUB92]], [[TMP6]] ; 
CHECK-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000 -; CHECK-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000 -; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000 -; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000 -; CHECK-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000 -; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]] -; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]] -; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]] -; CHECK-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]] +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> , [[TMP9]] +; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float undef, undef +; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], undef +; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], undef +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP12]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP13]], [[MUL20]] +; CHECK-NEXT: [[SUB28:%.*]] = fadd fast float [[SUB2694]], [[MUL20]] ; CHECK-NEXT: [[INC]] = add nuw i32 [[J_098]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> undef, float [[SUB19]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP14]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP19]] = insertelement <4 x float> [[TMP17]], float [[TMP18]], i32 3 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]] ; ; STORE-LABEL: @foo( @@ -790,54 +790,54 @@ ; STORE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ] ; STORE-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2 ; STORE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]] -; STORE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; STORE-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1 -; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] -; STORE-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4 -; STORE-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2 -; STORE-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]] -; STORE-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4 -; STORE-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3 -; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]] -; STORE-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 +; STORE-NEXT: [[TMP1:%.*]] = or i64 [[TMP0]], 1 +; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]] +; STORE-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 2 +; STORE-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] +; STORE-NEXT: [[TMP3:%.*]] = or i64 [[TMP0]], 3 +; STORE-NEXT: 
[[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]] +; STORE-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; STORE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 ; STORE-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]] ; STORE: for.body16.lr.ph: ; STORE-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]] -; STORE-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4 +; STORE-NEXT: [[TMP6:%.*]] = load float, float* [[ADD_PTR]], align 4 ; STORE-NEXT: br label [[FOR_BODY16:%.*]] ; STORE: for.cond.cleanup15: -; STORE-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ] -; STORE-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4 -; STORE-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4 -; STORE-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4 -; STORE-NEXT: store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4 +; STORE-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY]] ], [ [[TMP19:%.*]], [[FOR_BODY16]] ] +; STORE-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; STORE-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; STORE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; STORE-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6 ; STORE-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; STORE: for.body16: -; STORE-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ] ; STORE-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ] -; STORE-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000 -; STORE-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000 +; STORE-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP19]], [[FOR_BODY16]] ] +; STORE-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 +; STORE-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP10]], 0x3FF19999A0000000 +; STORE-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1 +; STORE-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[TMP11]], 0xBFF3333340000000 ; STORE-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]] -; STORE-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]] +; STORE-NEXT: [[SUB19:%.*]] = fadd fast float [[SUB92]], [[TMP6]] ; STORE-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000 -; STORE-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000 -; STORE-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000 -; STORE-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000 -; STORE-NEXT: 
[[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000 -; STORE-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]] -; STORE-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]] -; STORE-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]] -; STORE-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]] +; STORE-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> , [[TMP9]] +; STORE-NEXT: [[ADD2293:%.*]] = fadd fast float undef, undef +; STORE-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], undef +; STORE-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], undef +; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> undef, <4 x i32> +; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP12]], [[RDX_SHUF]] +; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> +; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; STORE-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; STORE-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP13]], [[MUL20]] +; STORE-NEXT: [[SUB28:%.*]] = fadd fast float [[SUB2694]], [[MUL20]] ; STORE-NEXT: [[INC]] = add nuw i32 [[J_098]], 1 ; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]] +; STORE-NEXT: [[TMP15:%.*]] = insertelement <4 x float> undef, float [[SUB19]], i32 0 +; STORE-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP10]], i32 1 +; STORE-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP14]], i32 2 +; STORE-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP9]], i32 2 +; STORE-NEXT: [[TMP19]] = insertelement <4 x float> [[TMP17]], float [[TMP18]], i32 3 ; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]] ; entry: Index: test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -15,32 +15,59 @@ ; Vector cost is 6, Scalar cost is 7 ; SSE: Adding cost -1 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction) define i32 @test_add(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_add( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = add i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = 
add <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_add( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = add i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_add( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = add i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] +; 
SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -136,32 +163,59 @@ ; } define i32 @test_and(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_and( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = and i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_and( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = and i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> 
[[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_and( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = and i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -197,32 +251,59 @@ ; } define i32 @test_or(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_or( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = or i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = 
or i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_or( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = or i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_or( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = or i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = 
shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -258,32 +339,59 @@ ; } define i32 @test_xor(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_xor( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_xor( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = xor i32 undef, 
[[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_xor( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -312,25 +420,45 @@ } define i32 @PR37731(<4 x i32>* noalias nocapture dereferenceable(16) %self) unnamed_addr #0 { -; CHECK-LABEL: @PR37731( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = xor i32 undef, undef -; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; 
CHECK-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef -; CHECK-NEXT: ret i32 [[TMP9]] +; AVX-LABEL: @PR37731( +; AVX-NEXT: entry: +; AVX-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16 +; AVX-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], +; AVX-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] +; AVX-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], +; AVX-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], +; AVX-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], +; AVX-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] +; AVX-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 +; AVX-NEXT: [[TMP7:%.*]] = xor i32 undef, undef +; AVX-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; AVX-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef +; AVX-NEXT: ret i32 [[TMP9]] +; +; SSE-LABEL: @PR37731( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16 +; SSE-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], +; SSE-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] +; SSE-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], +; SSE-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 +; SSE-NEXT: [[TMP7:%.*]] = xor i32 undef, undef +; SSE-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef +; SSE-NEXT: ret i32 [[TMP9]] ; entry: %0 = load <4 x i32>, <4 x i32>* %self, align 16 Index: test/Transforms/SLPVectorizer/X86/simplebb.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/simplebb.ll +++ test/Transforms/SLPVectorizer/X86/simplebb.ll @@ -64,15 +64,17 @@ ; CHECK-LABEL: @test_volatile_load( ; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8 ; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8 -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]] ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 ; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 ; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8 -; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]] -; CHECK-NEXT: store double [[MUL]], double* [[C:%.*]], align 8 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1 -; CHECK-NEXT: store double 
[[MUL5]], double* [[ARRAYIDX5]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[I0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[I1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[I4]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; %i0 = load volatile double, double* %a, align 8 Index: test/Transforms/SLPVectorizer/X86/unreachable.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/unreachable.ll +++ test/Transforms/SLPVectorizer/X86/unreachable.ll @@ -21,19 +21,18 @@ ; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[T7]], align 4 ; CHECK-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 7 ; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[T9]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[T6]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T8]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T10]], i32 3 ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[T1_0:%.*]] = phi i32 [ [[T4]], [[BB1:%.*]] ], [ 2, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[T2_0:%.*]] = phi i32 [ [[T6]], [[BB1]] ], [ 2, [[ENTRY]] ] -; CHECK-NEXT: [[T3_0:%.*]] = phi i32 [ [[T8]], [[BB1]] ], [ 2, [[ENTRY]] ] -; CHECK-NEXT: [[T4_0:%.*]] = phi i32 [ [[T10]], [[BB1]] ], [ 2, [[ENTRY]] ] -; CHECK-NEXT: store i32 [[T1_0]], i32* [[X]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i32> [ [[TMP3]], [[BB1:%.*]] ], [ <i32 2, i32 2, i32 2, i32 2>, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1 -; CHECK-NEXT: store i32 [[T2_0]], i32* [[T12]], align 4 ; CHECK-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2 -; CHECK-NEXT: store i32 [[T3_0]], i32* [[T13]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3 -; CHECK-NEXT: store i32 [[T4_0]], i32* [[T14]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[X]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: