Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3243,15 +3243,26 @@ // Do a quadratic search on all of the given stores and find // all of the pairs of stores that follow each other. + SmallVector<unsigned, 16> IndexQueue; for (unsigned i = 0, e = Stores.size(); i < e; ++i) { - for (unsigned j = 0; j < e; ++j) { - if (i == j) - continue; - const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); - if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) { - Tails.insert(Stores[j]); + const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); + IndexQueue.clear(); + // If a store has multiple consecutive store candidates, search Stores + // array according to the sequence: from i+1 to e, then from i-1 to 0. + // This is because usually pairing with immediate succeeding or preceding + // candidate create the best chance to find slp vectorization opportunity. 
+ unsigned j = 0; + for (j = i + 1; j < e; ++j) + IndexQueue.push_back(j); + for (j = i; j > 0; --j) + IndexQueue.push_back(j - 1); + + for (auto &k : IndexQueue) { + if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) { + Tails.insert(Stores[k]); Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; + ConsecutiveChain[Stores[i]] = Stores[k]; + break; } } } Index: test/Transforms/SLPVectorizer/X86/pr23510.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/pr23510.ll +++ test/Transforms/SLPVectorizer/X86/pr23510.ll @@ -0,0 +1,38 @@ +; PR23510 +; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: @_Z3fooPml( +; CHECK: lshr <2 x i64> +; CHECK: lshr <2 x i64> + +@total = global i64 0, align 8 + +define void @_Z3fooPml(i64* nocapture %a, i64 %i) { +entry: + %tmp = load i64, i64* %a, align 8 + %shr = lshr i64 %tmp, 4 + store i64 %shr, i64* %a, align 8 + %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 1 + %tmp1 = load i64, i64* %arrayidx1, align 8 + %shr2 = lshr i64 %tmp1, 4 + store i64 %shr2, i64* %arrayidx1, align 8 + %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 %i + %tmp2 = load i64, i64* %arrayidx3, align 8 + %tmp3 = load i64, i64* @total, align 8 + %add = add i64 %tmp3, %tmp2 + store i64 %add, i64* @total, align 8 + %tmp4 = load i64, i64* %a, align 8 + %shr5 = lshr i64 %tmp4, 4 + store i64 %shr5, i64* %a, align 8 + %tmp5 = load i64, i64* %arrayidx1, align 8 + %shr7 = lshr i64 %tmp5, 4 + store i64 %shr7, i64* %arrayidx1, align 8 + %tmp6 = load i64, i64* %arrayidx3, align 8 + %tmp7 = load i64, i64* @total, align 8 + %add9 = add i64 %tmp7, %tmp6 + store i64 %add9, i64* @total, align 8 + ret void +}