diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -679,6 +679,12 @@
                      const ValueToValueMap &StridesMap = ValueToValueMap(),
                      bool Assume = false, bool ShouldCheckWrap = true);
 
+/// Returns the distance between the pointers \p PtrA and \p PtrB iff they are
+/// compatible and it is possible to calculate the distance between them. This
+/// is a simple API that does not depend on the analysis pass.
+Optional<int> getPointersDiff(Value *PtrA, Value *PtrB, const DataLayout &DL,
+                              ScalarEvolution &SE);
+
 /// Attempt to sort the pointers in \p VL and return the sorted indices
 /// in \p SortedIndices, if reordering is required.
 ///
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1124,6 +1124,67 @@
   return Stride;
 }
 
+Optional<int> llvm::getPointersDiff(Value *PtrA, Value *PtrB,
+                                    const DataLayout &DL, ScalarEvolution &SE) {
+  // Check that the pointers are valid and have identical types (and thus
+  // identical address spaces) before they are dereferenced below.
+  if (!PtrA || !PtrB || PtrA->getType() != PtrB->getType())
+    return None;
+  unsigned ASA = PtrA->getType()->getPointerAddressSpace();
+  unsigned ASB = PtrB->getType()->getPointerAddressSpace();
+
+  // Identical pointers are trivially at distance 0.
+  if (PtrA == PtrB)
+    return 0;
+
+  unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+
+  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
+  Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+  Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+
+  IdxWidth = DL.getIndexSizeInBits(ASA);
+  APInt Size(IdxWidth, DL.getTypeStoreSize(Ty));
+  if (PtrA1 == PtrB1) {
+    // Retrieve the address spaces again as pointer stripping now tracks
+    // through `addrspacecast`.
+    ASA = cast<PointerType>(PtrA1->getType())->getAddressSpace();
+    ASB = cast<PointerType>(PtrB1->getType())->getAddressSpace();
+    // Check that the address spaces still match.
+    if (ASA != ASB)
+      return None;
+
+    OffsetA = OffsetA.sextOrTrunc(IdxWidth);
+    OffsetB = OffsetB.sextOrTrunc(IdxWidth);
+
+    // OffsetDelta = OffsetB - OffsetA;
+    const SCEV *OffsetSCEVA = SE.getConstant(OffsetA);
+    const SCEV *OffsetSCEVB = SE.getConstant(OffsetB);
+    const SCEV *OffsetDeltaSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
+    const APInt &OffsetDelta = cast<SCEVConstant>(OffsetDeltaSCEV)->getAPInt();
+
+    // The pointers are based on the same pointer, so the accumulated offsets
+    // are sufficient; convert the byte delta to a whole number of elements.
+    int Val = OffsetDelta.getSExtValue() / Size.getSExtValue();
+    if (Val * Size == OffsetDelta)
+      return Val;
+    return None;
+  }
+
+  // Otherwise compute the distance with SCEV between the base pointers.
+  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
+  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
+  const auto *Diff =
+      dyn_cast<SCEVConstant>(SE.getMinusSCEV(PtrSCEVB, PtrSCEVA));
+  if (Diff) {
+    int Val = Diff->getAPInt().getSExtValue() / Size.getSExtValue();
+    if (Val * Size == Diff->getAPInt())
+      return Val;
+  }
+  return None;
+}
+
 bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
                            ScalarEvolution &SE,
                            SmallVectorImpl<unsigned> &SortedIndices) {
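For reviewers who want to try the new API, here is a minimal standalone sketch of how it can be driven outside of any pass. The IR snippet, the function name `f`, and the hand-built analysis setup are illustrative assumptions, not part of this patch; only `getPointersDiff` itself is. Note that the result is measured in elements of the pointee type, not in bytes:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/LoopAccessAnalysis.h"
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/AsmParser/Parser.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/SourceMgr.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      SMDiagnostic Err;
      // Two loads from adjacent i32 slots of the same base pointer.
      std::unique_ptr<Module> M = parseAssemblyString(R"IR(
        define i32 @f(i32* %base) {
          %p1 = getelementptr inbounds i32, i32* %base, i64 1
          %a = load i32, i32* %base
          %b = load i32, i32* %p1
          %s = add i32 %a, %b
          ret i32 %s
        }
      )IR",
                                                      Err, Ctx);
      if (!M)
        return 1;
      Function &F = *M->getFunction("f");

      // Hand-build the analyses ScalarEvolution needs; a real client would
      // get them from the pass manager instead.
      TargetLibraryInfoImpl TLII;
      TargetLibraryInfo TLI(TLII);
      AssumptionCache AC(F);
      DominatorTree DT(F);
      LoopInfo LI(DT);
      ScalarEvolution SE(F, TLI, AC, DT, LI);

      SmallVector<LoadInst *, 2> Loads;
      for (Instruction &I : F.getEntryBlock())
        if (auto *L = dyn_cast<LoadInst>(&I))
          Loads.push_back(L);

      // The distance comes back in units of the pointee type (i32 here), so
      // the two adjacent loads are exactly one element apart.
      Optional<int> Dist =
          getPointersDiff(Loads[0]->getPointerOperand(),
                          Loads[1]->getPointerOperand(), M->getDataLayout(), SE);
      if (Dist)
        errs() << "distance in elements: " << *Dist << "\n"; // prints 1
      return 0;
    }

The element-based result is what lets the callers below drop their explicit TypeAllocSize arithmetic.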
@@ -1136,32 +1197,17 @@
   // Walk over the pointers, and map each of them to an offset relative to
   // first pointer in the array.
   Value *Ptr0 = VL[0];
-  const SCEV *Scev0 = SE.getSCEV(Ptr0);
-  Value *Obj0 = getUnderlyingObject(Ptr0);
+  OffValPairs.emplace_back(0, Ptr0);
 
   llvm::SmallSet<int64_t, 4> Offsets;
-  for (auto *Ptr : VL) {
-    // TODO: Outline this code as a special, more time consuming, version of
-    // computeConstantDifference() function.
-    if (Ptr->getType()->getPointerAddressSpace() !=
-        Ptr0->getType()->getPointerAddressSpace())
-      return false;
-    // If a pointer refers to a different underlying object, bail - the
-    // pointers are by definition incomparable.
-    Value *CurrObj = getUnderlyingObject(Ptr);
-    if (CurrObj != Obj0)
-      return false;
-
-    const SCEV *Scev = SE.getSCEV(Ptr);
-    const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Scev, Scev0));
-    // The pointers may not have a constant offset from each other, or SCEV
-    // may just not be smart enough to figure out they do. Regardless,
-    // there's nothing we can do.
+  Offsets.insert(0);
+  for (auto *Ptr : VL.drop_front()) {
+    Optional<int> Diff = getPointersDiff(Ptr0, Ptr, DL, SE);
     if (!Diff)
       return false;
 
     // Check if the pointer with the same offset is found.
-    int64_t Offset = Diff->getAPInt().getSExtValue();
+    int64_t Offset = *Diff;
     if (!Offsets.insert(Offset).second)
       return false;
     OffValPairs.emplace_back(Offset, Ptr);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -941,10 +941,15 @@
                                        ScalarEvolution &SE) {
   auto *LI1 = dyn_cast<LoadInst>(V1);
   auto *LI2 = dyn_cast<LoadInst>(V2);
-  if (LI1 && LI2)
-    return isConsecutiveAccess(LI1, LI2, DL, SE)
-               ? VLOperands::ScoreConsecutiveLoads
-               : VLOperands::ScoreFail;
+  if (LI1 && LI2) {
+    if (LI1->getParent() != LI2->getParent())
+      return VLOperands::ScoreFail;
+
+    Optional<int> Dist = getPointersDiff(LI1->getPointerOperand(),
+                                         LI2->getPointerOperand(), DL, SE);
+    return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads
+                                : VLOperands::ScoreFail;
+  }
 
   auto *C1 = dyn_cast<Constant>(V1);
   auto *C2 = dyn_cast<Constant>(V2);
@@ -2871,13 +2876,9 @@
         Ptr0 = PointerOps[CurrentOrder.front()];
         PtrN = PointerOps[CurrentOrder.back()];
       }
-      const SCEV *Scev0 = SE->getSCEV(Ptr0);
-      const SCEV *ScevN = SE->getSCEV(PtrN);
-      const auto *Diff =
-          dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
-      uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+      Optional<int> Diff = getPointersDiff(Ptr0, PtrN, *DL, *SE);
       // Check that the sorted loads are consecutive.
-      if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+      if (Diff && static_cast<unsigned>(*Diff) == VL.size() - 1) {
         if (CurrentOrder.empty()) {
           // Original loads are consecutive and does not require reordering.
           ++NumOpsWantToKeepOriginalOrder;
@@ -3150,13 +3151,9 @@
         Ptr0 = PointerOps[CurrentOrder.front()];
         PtrN = PointerOps[CurrentOrder.back()];
       }
-      const SCEV *Scev0 = SE->getSCEV(Ptr0);
-      const SCEV *ScevN = SE->getSCEV(PtrN);
-      const auto *Diff =
-          dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
-      uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+      Optional<int> Dist = getPointersDiff(Ptr0, PtrN, *DL, *SE);
       // Check that the sorted pointer operands are consecutive.
-      if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+      if (Dist && static_cast<unsigned>(*Dist) == VL.size() - 1) {
         if (CurrentOrder.empty()) {
           // Original stores are consecutive and does not require reordering.
           ++NumOpsWantToKeepOriginalOrder;
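Worth noting for review: the consecutiveness test changes units in the two hunks above. The old code compared a raw SCEV byte distance against (VL.size() - 1) * TypeAllocSize, while getPointersDiff already divides by the element size, so the new code compares element counts. A tiny worked check of the equivalence (all concrete numbers below are made up for illustration):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Four consecutive i32 accesses: Ptr0, Ptr0+1, Ptr0+2, Ptr0+3.
      const uint64_t NumPtrs = 4;    // VL.size()
      const uint64_t Size = 4;       // DL->getTypeAllocSize(i32)
      const uint64_t ByteDelta = 12; // old: SCEV distance PtrN - Ptr0 in bytes
      const int ElemDelta = 3;       // new: *getPointersDiff(Ptr0, PtrN, DL, SE)

      // Old, byte-based form of the test ...
      assert(ByteDelta == (NumPtrs - 1) * Size);
      // ... and the equivalent element-based form used by this patch.
      assert(static_cast<uint64_t>(ElemDelta) == NumPtrs - 1);
      return 0;
    }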
@@ -6107,20 +6104,41 @@
   int E = Stores.size();
   SmallBitVector Tails(E, false);
-  SmallVector<int, 16> ConsecutiveChain(E, E + 1);
   int MaxIter = MaxStoreLookup.getValue();
+  SmallVector<std::pair<int, int>, 16> ConsecutiveChain(
+      E, std::make_pair(E, INT_MAX));
+  SmallVector<SmallBitVector> CheckedPairs(E, SmallBitVector(E, false));
   int IterCnt;
   auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
+                                  &CheckedPairs,
                                   &ConsecutiveChain](int K, int Idx) {
     if (IterCnt >= MaxIter)
       return true;
+    if (CheckedPairs[Idx].test(K))
+      return ConsecutiveChain[K].second == 1 &&
+             ConsecutiveChain[K].first == Idx;
     ++IterCnt;
-    if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
+    CheckedPairs[Idx].set(K);
+    CheckedPairs[K].set(Idx);
+    Optional<int> Diff =
+        getPointersDiff(Stores[K]->getPointerOperand(),
+                        Stores[Idx]->getPointerOperand(), *DL, *SE);
+    if (!Diff || *Diff == 0)
+      return false;
+    int Val = *Diff;
+    if (Val < 0) {
+      if (ConsecutiveChain[Idx].second > -Val) {
+        Tails.set(K);
+        ConsecutiveChain[Idx] = std::make_pair(K, -Val);
+      }
+      return false;
+    }
+    if (ConsecutiveChain[K].second <= Val)
       return false;
     Tails.set(Idx);
-    ConsecutiveChain[K] = Idx;
-    return true;
+    ConsecutiveChain[K] = std::make_pair(Idx, Val);
+    return Val == 1;
   };
   // Do a quadratic search on all of the given stores in reverse order and find
   // all of the pairs of stores that follow each other.
@@ -6140,16 +6158,28 @@
   // For stores that start but don't end a link in the chain:
   for (int Cnt = E; Cnt > 0; --Cnt) {
     int I = Cnt - 1;
-    if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
+    if (ConsecutiveChain[I].first == E || Tails.test(I))
      continue;
     // We found a store instr that starts a chain. Now follow the chain and try
     // to vectorize it.
     BoUpSLP::ValueList Operands;
     // Collect the chain into a list.
-    while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
+    while (I != E && !VectorizedStores.count(Stores[I])) {
       Operands.push_back(Stores[I]);
+      Tails.set(I);
+      if (ConsecutiveChain[I].second != 1) {
+        // Mark the new end in the chain and go back, if required. It might be
+        // required if the original stores come in reversed order, for example.
+        if (ConsecutiveChain[I].first != E &&
+            Tails.test(ConsecutiveChain[I].first)) {
+          Tails.reset(ConsecutiveChain[I].first);
+          if (Cnt < ConsecutiveChain[I].first + 2)
+            Cnt = ConsecutiveChain[I].first + 2;
+        }
+        break;
+      }
       // Move to the next value in the chain.
-      I = ConsecutiveChain[I];
+      I = ConsecutiveChain[I].first;
     }
 
     // If a vector register can't hold 1 element, we are done.
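A toy model of the new ConsecutiveChain bookkeeping may help review the vectorizeStores changes above. Plain integer element offsets stand in for the stores' pointer operands (the offsets below are invented): each entry now records the closest known successor and its positive distance, rather than only a distance-1 successor, which is what lets stores that arrive out of order still be stitched into a distance-1 chain:

    #include <climits>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
      // Hypothetical element offsets of Stores[0..3]; note they are unsorted.
      std::vector<int> Offset = {3, 1, 0, 2};
      int E = Offset.size();
      // first = index of the closest known successor (E means "none"),
      // second = its positive distance (INT_MAX means "none"), as in the patch.
      std::vector<std::pair<int, int>> ConsecutiveChain(E, {E, INT_MAX});
      for (int K = 0; K < E; ++K)
        for (int Idx = 0; Idx < E; ++Idx) {
          int Val = Offset[Idx] - Offset[K]; // stands in for getPointersDiff
          if (Val > 0 && ConsecutiveChain[K].second > Val)
            ConsecutiveChain[K] = {Idx, Val};
        }
      // Follow distance-1 links from the lowest store (Stores[2], offset 0),
      // as the vectorizer does when collecting Operands: 2 -> 1 -> 3 -> 0.
      for (int I = 2; I != E && ConsecutiveChain[I].second == 1;
           I = ConsecutiveChain[I].first)
        std::printf("Stores[%d] (offset %d) -> Stores[%d]\n", I, Offset[I],
                    ConsecutiveChain[I].first);
      return 0;
    }

The real code additionally re-walks a partially collected chain when a closer end is discovered later (the Tails.reset / Cnt adjustment above); the model omits that for brevity.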
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+sse2 -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+avx -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+sse2 -S | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+avx -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX
 
 %class.1 = type { %class.2 }
 %class.2 = type { %"class.3" }
@@ -117,13 +117,10 @@
 ; AVX-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
 ; AVX-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
 ; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP11]], i32 0
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP14:%.*]] = lshr <2 x i64> [[TMP13]],
-; AVX-NEXT:    [[TMP15:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP14]]
-; AVX-NEXT:    [[TMP16:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
-; AVX-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 1
+; AVX-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]],
+; AVX-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]]
+; AVX-NEXT:    [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
+; AVX-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1
 ; AVX-NEXT:    ret void
 ;
 entry: