Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4314,7 +4314,8 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, BoUpSLP &R) { - SetVector Heads, Tails; + SetVector Heads; + SmallDenseSetTails; SmallDenseMap ConsecutiveChain; // We may run into multiple chains that merge into a single chain. We mark the @@ -4322,45 +4323,51 @@ BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - // Do a quadratic search on all of the given stores and find + // Do a quadratic search on all of the given stores in reverse order and find // all of the pairs of stores that follow each other. SmallVector IndexQueue; - for (unsigned i = 0, e = Stores.size(); i < e; ++i) { - IndexQueue.clear(); + unsigned E = Stores.size(); + IndexQueue.resize(E - 1); + for (unsigned I = E; I > 0; --I) { + unsigned Idx = I - 1; // If a store has multiple consecutive store candidates, search Stores - // array according to the sequence: from i+1 to e, then from i-1 to 0. + // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... // This is because usually pairing with immediate succeeding or preceding // candidate create the best chance to find slp vectorization opportunity. - unsigned j = 0; - for (j = i + 1; j < e; ++j) - IndexQueue.push_back(j); - for (j = i; j > 0; --j) - IndexQueue.push_back(j - 1); - - for (auto &k : IndexQueue) { - if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) { - Tails.insert(Stores[k]); - Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[k]; + unsigned Offset = 1; + unsigned Cnt = 0; + for (unsigned J = 0; J < E - 1; ++J, ++Offset) { + if (Idx >= Offset) { + IndexQueue[Cnt] = Idx - Offset; + ++Cnt; + } + if (Idx + Offset < E) { + IndexQueue[Cnt] = Idx + Offset; + ++Cnt; + } + } + + for (auto K : IndexQueue) { + if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) { + Tails.insert(Stores[Idx]); + Heads.insert(Stores[K]); + ConsecutiveChain[Stores[K]] = Stores[Idx]; break; } } } // For stores that start but don't end a link in the chain: - for (SetVector::iterator it = Heads.begin(), e = Heads.end(); - it != e; ++it) { - if (Tails.count(*it)) + for (auto *SI : llvm::reverse(Heads)) { + if (Tails.count(SI)) continue; // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; - StoreInst *I = *it; + StoreInst *I = SI; // Collect the chain into a list. - while (Tails.count(I) || Heads.count(I)) { - if (VectorizedStores.count(I)) - break; + while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) { Operands.push_back(I); // Move to the next value in the chain. I = ConsecutiveChain[I]; Index: test/Transforms/SLPVectorizer/X86/stores_vectorize.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/stores_vectorize.ll +++ test/Transforms/SLPVectorizer/X86/stores_vectorize.ll @@ -25,26 +25,21 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]] ; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[P3]], align 8 -; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[TMP2]], 5 -; CHECK-NEXT: store i64 [[SHR]], i64* [[P3]], align 8 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = lshr <2 x i64> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3 -; CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8 -; CHECK-NEXT: [[SHR9:%.*]] = lshr i64 [[TMP7]], 5 -; CHECK-NEXT: store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <4 x i64>* +; CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8 ; CHECK-NEXT: [[ADD_PTR11:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[AND:%.*]] = and i64 [[SHR]], 5 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 +; CHECK-NEXT: [[AND:%.*]] = and i64 [[TMP6]], 5 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[ADD_PTR11]], i64 [[AND]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX13]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[P4]], align 4 -; CHECK-NEXT: [[ADD15:%.*]] = fadd float [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[P4]], align 4 +; CHECK-NEXT: [[ADD15:%.*]] = fadd float [[TMP7]], [[TMP8]] ; CHECK-NEXT: store float [[ADD15]], float* [[P4]], align 4 ; CHECK-NEXT: ret void ;