diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -101,6 +101,9 @@
   /// Try to vectorize a chain that may start at the operands of \p I.
   bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
 
+  /// \returns true if \p U is part of a store seed.
+  bool isStoreSeed(User *U) const;
+
   /// Vectorize the store instructions collected in Stores.
   bool vectorizeStoreChains(slpvectorizer::BoUpSLP &R);
 
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -890,7 +890,8 @@
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None);
+  InstructionCost getTreeCost(function_ref<bool(User *)> IsSeed,
+                              ArrayRef<Value *> VectorizedVals = None);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -7030,7 +7031,8 @@
   return Prev;
 }
 
-InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
+InstructionCost BoUpSLP::getTreeCost(function_ref<bool(User *)> IsSeed,
+                                     ArrayRef<Value *> VectorizedVals) {
   InstructionCost Cost = 0;
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                     << VectorizableTree.size() << ".\n");
@@ -7074,6 +7076,11 @@
     if (isa<FixedVectorType>(EU.Scalar->getType()))
       continue;
 
+    // Skip the extract cost if this scalar feeds a store that is a seed.
+    if (isa<LoadInst>(EU.Scalar) && EU.User && isa<StoreInst>(EU.User) &&
+        IsSeed(EU.User))
+      continue;
+
     // If found user is an insertelement, do not calculate extract cost but try
     // to detect it as a final shuffled/identity match.
     if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
@@ -10077,7 +10084,8 @@
 
   R.computeMinimumValueSizes();
 
-  InstructionCost Cost = R.getTreeCost();
+  InstructionCost Cost =
+      R.getTreeCost([this](User *U) { return isStoreSeed(U); });
   LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                     << "\n");
   if (Cost < -SLPCostThreshold) {
@@ -10376,7 +10384,7 @@
     R.buildExternalUses();
 
     R.computeMinimumValueSizes();
-    InstructionCost Cost = R.getTreeCost();
+    InstructionCost Cost = R.getTreeCost([](User *U) { return false; });
     CandidateFound = true;
     MinCost = std::min(MinCost, Cost);
 
@@ -11216,7 +11224,8 @@
       if (auto *FPMO = dyn_cast<FPMathOperator>(U))
         RdxFMF &= FPMO->getFastMathFlags();
       // Estimate cost.
-      InstructionCost TreeCost = V.getTreeCost(VL);
+      InstructionCost TreeCost =
+          V.getTreeCost([](User *U) { return false; }, VL);
       InstructionCost ReductionCost =
           getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
       InstructionCost Cost = TreeCost + ReductionCost;
@@ -12371,6 +12380,20 @@
   return Changed;
 }
 
+bool SLPVectorizerPass::isStoreSeed(User *U) const {
+  StoreInst *SI = dyn_cast<StoreInst>(U);
+  if (SI == nullptr)
+    return false;
+  Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
+  auto It = Stores.find(Ptr);
+  if (It == Stores.end())
+    return false;
+  const StoreList &SList = It->second;
+  // Check if SI can be found among the seed stores in SList.
+  auto ListIt = find(SList, SI);
+  return ListIt != SList.end();
+}
+
 bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
   bool Changed = false;
   // Sort by type, base pointers and values operand. Value operands must be
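Reviewer note: below is a minimal standalone sketch (plain C++, compilable on its own; every identifier in it is invented for this example and none of it is LLVM API) of the pattern this patch introduces. getTreeCost() now receives an IsSeed predicate, and an external use whose user is a seed store no longer pays an extractelement cost, since the seed stores are vectorized together with the loads that feed them. vectorizeStoreChain() passes a real predicate backed by isStoreSeed(); the other two call sites pass a predicate that always returns false, preserving their old behavior.

```cpp
// Standalone sketch of the seed-aware cost query; not LLVM code.
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Use {
  std::string Scalar; // e.g. a load that is part of the vectorized tree.
  std::string User;   // e.g. a store consuming that load.
  bool UserIsStore;   // stand-in for isa<StoreInst>(EU.User).
};

// Mirrors the shape of getTreeCost(function_ref<bool(User *)> IsSeed, ...):
// each external use normally pays an extract cost, unless it feeds a seed.
int getTreeCostSketch(const std::vector<Use> &ExternalUses,
                      const std::function<bool(const std::string &)> &IsSeed) {
  int Cost = 0;
  for (const Use &EU : ExternalUses) {
    // The new check: no extract is emitted for a use by a seed store.
    if (EU.UserIsStore && IsSeed(EU.User))
      continue;
    Cost += 1; // Simplified stand-in for the extractelement cost.
  }
  return Cost;
}

int main() {
  std::vector<Use> Uses = {{"ld0", "st0", true}, {"ld1", "st1", true}};
  // Store-chain vectorization passes a real predicate (cf. isStoreSeed());
  // the other entry points pass one that always returns false.
  auto IsSeed = [](const std::string &U) { return U == "st0" || U == "st1"; };
  std::cout << "with seed info:    " << getTreeCostSketch(Uses, IsSeed) << "\n";
  std::cout << "without seed info: "
            << getTreeCostSketch(Uses,
                                 [](const std::string &) { return false; })
            << "\n";
}
```

Running the sketch prints a cost of 0 when seed information is available and 2 when it is not, which is exactly the asymmetry the updated tests below exercise: the loads used by two store chains now cost nothing to "extract", so the whole group becomes profitable to vectorize.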
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/load-used-by-two-stores.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/load-used-by-two-stores.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/load-used-by-two-stores.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/load-used-by-two-stores.ll
@@ -10,13 +10,12 @@
 define void @load_used_by_two_stores_double(double *%ptr) {
 ; CHECK-LABEL: @load_used_by_two_stores_double(
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 1
-; CHECK-NEXT:    [[LD0:%.*]] = load double, double* [[GEP0]], align 8
-; CHECK-NEXT:    [[LD1:%.*]] = load double, double* [[GEP1]], align 8
-; CHECK-NEXT:    store double [[LD0]], double* [[GEP0]], align 8
-; CHECK-NEXT:    store double [[LD1]], double* [[GEP1]], align 8
-; CHECK-NEXT:    store double [[LD0]], double* [[GEP0]], align 8
-; CHECK-NEXT:    store double [[LD1]], double* [[GEP1]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[GEP0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[GEP0]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[GEP0]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* [[TMP4]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %gep0 = getelementptr inbounds double, double* %ptr, i64 0
@@ -35,21 +34,12 @@
 define void @load_used_by_two_stores_i32(i32 *%ptr) {
 ; CHECK-LABEL: @load_used_by_two_stores_i32(
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 0
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 1
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 2
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 3
-; CHECK-NEXT:    [[LD0:%.*]] = load i32, i32* [[GEP0]], align 8
-; CHECK-NEXT:    [[LD1:%.*]] = load i32, i32* [[GEP1]], align 8
-; CHECK-NEXT:    [[LD2:%.*]] = load i32, i32* [[GEP2]], align 8
-; CHECK-NEXT:    [[LD3:%.*]] = load i32, i32* [[GEP3]], align 8
-; CHECK-NEXT:    store i32 [[LD0]], i32* [[GEP0]], align 8
-; CHECK-NEXT:    store i32 [[LD1]], i32* [[GEP1]], align 8
-; CHECK-NEXT:    store i32 [[LD2]], i32* [[GEP2]], align 8
-; CHECK-NEXT:    store i32 [[LD3]], i32* [[GEP3]], align 8
-; CHECK-NEXT:    store i32 [[LD0]], i32* [[GEP0]], align 8
-; CHECK-NEXT:    store i32 [[LD1]], i32* [[GEP1]], align 8
-; CHECK-NEXT:    store i32 [[LD2]], i32* [[GEP2]], align 8
-; CHECK-NEXT:    store i32 [[LD3]], i32* [[GEP3]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[GEP0]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[GEP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[GEP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %gep0 = getelementptr inbounds i32, i32* %ptr, i64 0
@@ -80,69 +70,12 @@
 define void @load_used_by_two_stores_i8(i8 *%ptr) {
 ; CHECK-LABEL: @load_used_by_two_stores_i8(
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 0
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 1
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 2
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 3
-; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 4
-; CHECK-NEXT:    [[GEP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 5
-; CHECK-NEXT:    [[GEP6:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 6
-; CHECK-NEXT:    [[GEP7:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 7
-; CHECK-NEXT:    [[GEP8:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 8
-; CHECK-NEXT:    [[GEP9:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 9
-; CHECK-NEXT:    [[GEP10:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 10
-; CHECK-NEXT:    [[GEP11:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 11
-; CHECK-NEXT:    [[GEP12:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 12
-; CHECK-NEXT:    [[GEP13:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 13
-; CHECK-NEXT:    [[GEP14:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 14
-; CHECK-NEXT:    [[GEP15:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 15
-; CHECK-NEXT:    [[LD0:%.*]] = load i8, i8* [[GEP0]], align 8
-; CHECK-NEXT:    [[LD1:%.*]] = load i8, i8* [[GEP1]], align 8
-; CHECK-NEXT:    [[LD2:%.*]] = load i8, i8* [[GEP2]], align 8
-; CHECK-NEXT:    [[LD3:%.*]] = load i8, i8* [[GEP3]], align 8
-; CHECK-NEXT:    [[LD4:%.*]] = load i8, i8* [[GEP4]], align 8
-; CHECK-NEXT:    [[LD5:%.*]] = load i8, i8* [[GEP5]], align 8
-; CHECK-NEXT:    [[LD6:%.*]] = load i8, i8* [[GEP6]], align 8
-; CHECK-NEXT:    [[LD7:%.*]] = load i8, i8* [[GEP7]], align 8
-; CHECK-NEXT:    [[LD8:%.*]] = load i8, i8* [[GEP8]], align 8
-; CHECK-NEXT:    [[LD9:%.*]] = load i8, i8* [[GEP9]], align 8
-; CHECK-NEXT:    [[LD10:%.*]] = load i8, i8* [[GEP10]], align 8
-; CHECK-NEXT:    [[LD11:%.*]] = load i8, i8* [[GEP11]], align 8
-; CHECK-NEXT:    [[LD12:%.*]] = load i8, i8* [[GEP12]], align 8
-; CHECK-NEXT:    [[LD13:%.*]] = load i8, i8* [[GEP13]], align 8
-; CHECK-NEXT:    [[LD14:%.*]] = load i8, i8* [[GEP14]], align 8
-; CHECK-NEXT:    [[LD15:%.*]] = load i8, i8* [[GEP15]], align 8
-; CHECK-NEXT:    store i8 [[LD0]], i8* [[GEP0]], align 8
-; CHECK-NEXT:    store i8 [[LD1]], i8* [[GEP1]], align 8
-; CHECK-NEXT:    store i8 [[LD2]], i8* [[GEP2]], align 8
-; CHECK-NEXT:    store i8 [[LD3]], i8* [[GEP3]], align 8
-; CHECK-NEXT:    store i8 [[LD4]], i8* [[GEP4]], align 8
-; CHECK-NEXT:    store i8 [[LD5]], i8* [[GEP5]], align 8
-; CHECK-NEXT:    store i8 [[LD6]], i8* [[GEP6]], align 8
-; CHECK-NEXT:    store i8 [[LD7]], i8* [[GEP7]], align 8
-; CHECK-NEXT:    store i8 [[LD8]], i8* [[GEP8]], align 8
-; CHECK-NEXT:    store i8 [[LD9]], i8* [[GEP9]], align 8
-; CHECK-NEXT:    store i8 [[LD10]], i8* [[GEP10]], align 8
-; CHECK-NEXT:    store i8 [[LD11]], i8* [[GEP11]], align 8
-; CHECK-NEXT:    store i8 [[LD12]], i8* [[GEP12]], align 8
-; CHECK-NEXT:    store i8 [[LD13]], i8* [[GEP13]], align 8
-; CHECK-NEXT:    store i8 [[LD14]], i8* [[GEP14]], align 8
-; CHECK-NEXT:    store i8 [[LD15]], i8* [[GEP15]], align 8
-; CHECK-NEXT:    store i8 [[LD0]], i8* [[GEP0]], align 8
-; CHECK-NEXT:    store i8 [[LD1]], i8* [[GEP1]], align 8
-; CHECK-NEXT:    store i8 [[LD2]], i8* [[GEP2]], align 8
-; CHECK-NEXT:    store i8 [[LD3]], i8* [[GEP3]], align 8
-; CHECK-NEXT:    store i8 [[LD4]], i8* [[GEP4]], align 8
-; CHECK-NEXT:    store i8 [[LD5]], i8* [[GEP5]], align 8
-; CHECK-NEXT:    store i8 [[LD6]], i8* [[GEP6]], align 8
-; CHECK-NEXT:    store i8 [[LD7]], i8* [[GEP7]], align 8
-; CHECK-NEXT:    store i8 [[LD8]], i8* [[GEP8]], align 8
-; CHECK-NEXT:    store i8 [[LD9]], i8* [[GEP9]], align 8
-; CHECK-NEXT:    store i8 [[LD10]], i8* [[GEP10]], align 8
-; CHECK-NEXT:    store i8 [[LD11]], i8* [[GEP11]], align 8
-; CHECK-NEXT:    store i8 [[LD12]], i8* [[GEP12]], align 8
-; CHECK-NEXT:    store i8 [[LD13]], i8* [[GEP13]], align 8
-; CHECK-NEXT:    store i8 [[LD14]], i8* [[GEP14]], align 8
-; CHECK-NEXT:    store i8 [[LD15]], i8* [[GEP15]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[GEP0]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[GEP0]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[GEP0]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* [[TMP4]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %gep0 = getelementptr inbounds i8, i8* %ptr, i64 0