Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -714,9 +714,10 @@
                          bool Assume = false);
 
 /// \brief Returns true if the memory operations \p A and \p B are consecutive.
-/// This is a simple API that does not depend on the analysis pass.
+/// This is a simple API that does not depend on the analysis pass.
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
-                         ScalarEvolution &SE, bool CheckType = true);
+                         ScalarEvolution &SE, DominatorTree *DT,
+                         bool CheckType = true, bool CheckSizeType = false);
 
 /// \brief This analysis provides dependence information for the memory accesses
 /// of a loop.
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -996,7 +996,8 @@
 
 /// Returns true if the memory operations \p A and \p B are consecutive.
 bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
-                               ScalarEvolution &SE, bool CheckType) {
+                               ScalarEvolution &SE, DominatorTree *DT,
+                               bool CheckType, bool CheckSizeType) {
   Value *PtrA = getPointerOperand(A);
   Value *PtrB = getPointerOperand(B);
   unsigned ASA = getAddressSpaceOperand(A);
@@ -1011,8 +1012,17 @@
     return false;
 
   // Make sure that A and B have the same type if required.
-  if(CheckType && PtrA->getType() != PtrB->getType())
-    return false;
+  if (!CheckSizeType && CheckType && PtrA->getType() != PtrB->getType())
+    return false;
+
+  // Make sure that A and B have the same size type if required.
+  Type *PtrATy = PtrA->getType()->getPointerElementType();
+  Type *PtrBTy = PtrB->getType()->getPointerElementType();
+  if (CheckSizeType &&
+      (DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
+       DL.getTypeStoreSize(PtrATy->getScalarType()) !=
+           DL.getTypeStoreSize(PtrBTy->getScalarType())))
+    return false;
 
   unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
   Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
@@ -1043,7 +1053,76 @@
   const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
   const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
   const SCEV *X = SE.getAddExpr(PtrSCEVA, BaseDelta);
-  return X == PtrSCEVB;
+  if (X == PtrSCEVB)
+    return true;
+
+  // Sometimes even this doesn't work, because SCEV can't always see through
+  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
+  // things the hard way.
+
+  // Look through GEPs after checking they're the same except for the last
+  // index.
+  GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(getPointerOperand(A));
+  GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(getPointerOperand(B));
+  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
+    return false;
+  unsigned FinalIndex = GEPA->getNumOperands() - 1;
+  for (unsigned i = 0; i < FinalIndex; i++)
+    if (GEPA->getOperand(i) != GEPB->getOperand(i))
+      return false;
+
+  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
+  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
+  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
+      OpA->getType() != OpB->getType())
+    return false;
+
+  // Only look through a ZExt/SExt.
+  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
+    return false;
+
+  bool Signed = isa<SExtInst>(OpA);
+
+  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
+  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
+  if (!OpA || !OpB || OpA->getType() != OpB->getType())
+    return false;
+
+  // Now we need to prove that adding 1 to OpA won't overflow.
+  bool Safe = false;
+  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
+  // we're okay.
+  if (OpB->getOpcode() == Instruction::Add &&
+      isa<ConstantInt>(OpB->getOperand(1)) &&
+      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
+    if (Signed)
+      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
+    else
+      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
+  }
+
+  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
+
+  // Second attempt:
+  // If any bits are known to be zero other than the sign bit in OpA, we can
+  // add 1 to it while guaranteeing no overflow of any sort.
+  if (!Safe && DT) {
+    APInt KnownZero(BitWidth, 0);
+    APInt KnownOne(BitWidth, 0);
+    computeKnownBits(OpA, KnownZero, KnownOne, DL, 0, nullptr, OpA, DT);
+    KnownZero &= ~APInt::getHighBitsSet(BitWidth, 1);
+    if (KnownZero != 0)
+      Safe = true;
+  }
+
+  if (!Safe)
+    return false;
+
+  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
+  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
+  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
+  return X2 == OffsetSCEVB;
 }
 
 bool MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
Index: lib/Transforms/Scalar/LoopIdiomRecognize.cpp
===================================================================
--- lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -593,7 +593,7 @@
     assert((SecondSplatValue || SecondPatternValue) &&
            "Expected either splat value or pattern value.");
 
-    if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
+    if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, nullptr, false)) {
       if (ForMemset) {
         if (FirstSplatValue != SecondSplatValue)
           continue;
Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/OrderedBasicBlock.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -84,8 +85,6 @@
     return DL.getABITypeAlignment(SI->getValueOperand()->getType());
   }
 
-  bool isConsecutiveAccess(Value *A, Value *B);
-
   /// After vectorization, reorder the instructions that I depends on
   /// (the instructions defining its operands), to ensure they dominate I.
   void reorder(Instruction *I);
@@ -230,121 +229,6 @@
   return -1;
 }
 
-// FIXME: Merge with llvm::isConsecutiveAccess
-bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
-  Value *PtrA = getPointerOperand(A);
-  Value *PtrB = getPointerOperand(B);
-  unsigned ASA = getPointerAddressSpace(A);
-  unsigned ASB = getPointerAddressSpace(B);
-
-  // Check that the address spaces match and that the pointers are valid.
-  if (!PtrA || !PtrB || (ASA != ASB))
-    return false;
-
-  // Make sure that A and B are different pointers of the same size type.
-  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
-  Type *PtrATy = PtrA->getType()->getPointerElementType();
-  Type *PtrBTy = PtrB->getType()->getPointerElementType();
-  if (PtrA == PtrB ||
-      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
-      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
-          DL.getTypeStoreSize(PtrBTy->getScalarType()))
-    return false;
-
-  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
-
-  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
-  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
-  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
-
-  APInt OffsetDelta = OffsetB - OffsetA;
-
-  // Check if they are based on the same pointer. That makes the offsets
-  // sufficient.
-  if (PtrA == PtrB)
-    return OffsetDelta == Size;
-
-  // Compute the necessary base pointer delta to have the necessary final delta
-  // equal to the size.
-  APInt BaseDelta = Size - OffsetDelta;
-
-  // Compute the distance with SCEV between the base pointers.
-  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
-  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
-  const SCEV *C = SE.getConstant(BaseDelta);
-  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
-  if (X == PtrSCEVB)
-    return true;
-
-  // Sometimes even this doesn't work, because SCEV can't always see through
-  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
-  // things the hard way.
-
-  // Look through GEPs after checking they're the same except for the last
-  // index.
-  GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(getPointerOperand(A));
-  GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(getPointerOperand(B));
-  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
-    return false;
-  unsigned FinalIndex = GEPA->getNumOperands() - 1;
-  for (unsigned i = 0; i < FinalIndex; i++)
-    if (GEPA->getOperand(i) != GEPB->getOperand(i))
-      return false;
-
-  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
-  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
-  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
-      OpA->getType() != OpB->getType())
-    return false;
-
-  // Only look through a ZExt/SExt.
-  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
-    return false;
-
-  bool Signed = isa<SExtInst>(OpA);
-
-  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
-  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
-  if (!OpA || !OpB || OpA->getType() != OpB->getType())
-    return false;
-
-  // Now we need to prove that adding 1 to OpA won't overflow.
-  bool Safe = false;
-  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
-  // we're okay.
-  if (OpB->getOpcode() == Instruction::Add &&
-      isa<ConstantInt>(OpB->getOperand(1)) &&
-      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
-    if (Signed)
-      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
-    else
-      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
-  }
-
-  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
-
-  // Second attempt:
-  // If any bits are known to be zero other than the sign bit in OpA, we can
-  // add 1 to it while guaranteeing no overflow of any sort.
-  if (!Safe) {
-    APInt KnownZero(BitWidth, 0);
-    APInt KnownOne(BitWidth, 0);
-    computeKnownBits(OpA, KnownZero, KnownOne, DL, 0, nullptr, OpA, &DT);
-    KnownZero &= ~APInt::getHighBitsSet(BitWidth, 1);
-    if (KnownZero != 0)
-      Safe = true;
-  }
-
-  if (!Safe)
-    return false;
-
-  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
-  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
-  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
-  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
-  return X2 == OffsetSCEVB;
-}
-
 void Vectorizer::reorder(Instruction *I) {
   OrderedBasicBlock OBB(I->getParent());
   SmallPtrSet<Instruction *, 16> InstructionsToMove;
@@ -639,7 +523,7 @@
       if (i == j)
         continue;
 
-      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
+      if (isConsecutiveAccess(Instrs[i], Instrs[j], DL, SE, &DT, true, true)) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1181,7 +1181,7 @@
   bool Consecutive = true;
   bool ReverseConsecutive = true;
   for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
-    if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
+    if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE, DT)) {
       Consecutive = false;
       break;
     } else {
@@ -1200,7 +1200,7 @@
   // check the reverse order.
   if (ReverseConsecutive)
     for (unsigned i = VL.size() - 1; i > 0; --i)
-      if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
+      if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE, DT)) {
        ReverseConsecutive = false;
        break;
      }
@@ -1373,7 +1373,7 @@
     case Instruction::Store: {
      // Check if the stores are consecutive or if we need to swizzle them.
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
-        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
+        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE, DT)) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
@@ -1988,11 +1988,11 @@
      if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
        Instruction *VL1 = cast<Instruction>(VL[j]);
        Instruction *VL2 = cast<Instruction>(VL[j + 1]);
-        if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
+        if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE, DT)) {
          std::swap(Left[j], Right[j]);
          continue;
        } else if (VL2->isCommutative() &&
-                   isConsecutiveAccess(L, L1, *DL, *SE)) {
+                   isConsecutiveAccess(L, L1, *DL, *SE, DT)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
@@ -2003,11 +2003,11 @@
      if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
        Instruction *VL1 = cast<Instruction>(VL[j]);
        Instruction *VL2 = cast<Instruction>(VL[j + 1]);
-        if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
+        if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE, DT)) {
          std::swap(Left[j], Right[j]);
          continue;
        } else if (VL2->isCommutative() &&
-                   isConsecutiveAccess(L, L1, *DL, *SE)) {
+                   isConsecutiveAccess(L, L1, *DL, *SE, DT)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
@@ -2155,7 +2155,7 @@
    for (unsigned j = 0; j < VL.size() - 1; ++j) {
      if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
        if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
-          if (isConsecutiveAccess(L, L1, *DL, *SE)) {
+          if (isConsecutiveAccess(L, L1, *DL, *SE, DT)) {
            std::swap(Left[j + 1], Right[j + 1]);
            continue;
          }
@@ -2163,7 +2163,7 @@
      }
      if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
        if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
-          if (isConsecutiveAccess(L, L1, *DL, *SE)) {
+          if (isConsecutiveAccess(L, L1, *DL, *SE, DT)) {
            std::swap(Left[j + 1], Right[j + 1]);
            continue;
          }
@@ -3762,7 +3762,7 @@
        IndexQueue.push_back(j - 1);

      for (auto &k : IndexQueue) {
-        if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
+        if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE, DT)) {
          Tails.insert(Stores[k]);
          Heads.insert(Stores[i]);
          ConsecutiveChain[Stores[i]] = Stores[k];
Index: test/Transforms/SLPVectorizer/X86/consecutive-access.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/consecutive-access.ll
+++ test/Transforms/SLPVectorizer/X86/consecutive-access.ll
@@ -8,11 +8,10 @@
 @D = common global [2000 x float] zeroinitializer, align 16
 
 ; Currently SCEV isn't smart enough to figure out that accesses
-; A[3*i], A[3*i+1] and A[3*i+2] are consecutive, but in future
-; that would hopefully be fixed. For now, check that this isn't
-; vectorized.
+; A[3*i], A[3*i+1] and A[3*i+2] are consecutive, but llvm::isConsecutiveAccess
+; can now catch this case. Check that this is vectorized.
 ; CHECK-LABEL: foo_3double
-; CHECK-NOT: x double>
+; CHECK: x double>
 ; Function Attrs: nounwind ssp uwtable
 define void @foo_3double(i32 %u) #0 {
 entry:
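
A few illustrative notes follow; none of this is part of the patch itself.

First, a minimal sketch of how a caller might use the widened interface.
StoreA, StoreB, DL, SE, and DT are hypothetical stand-ins for a pass's own
state; the two trailing flags mirror the LoadStoreVectorizer call site above,
which asks for the size-type check in place of the exact-type check.

  // Hypothetical caller: DL, SE, and DT belong to the enclosing pass.
  // Passing &DT enables the computeKnownBits fallback; passing nullptr
  // (as LoopIdiomRecognize does above) skips it.
  if (llvm::isConsecutiveAccess(StoreA, StoreB, DL, SE, &DT,
                                /*CheckType=*/true, /*CheckSizeType=*/true)) {
    // StoreB accesses the memory location immediately after StoreA, so the
    // pair is a candidate for merging into a wider access.
  }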
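
Second, a hypothetical IR reduction (not one of the patch's test changes) of
the A[3*i]/A[3*i+1] pattern that the new "hard way" path handles: SCEV alone
does not rewrite the second GEP index as the first plus one, but the GEPs
agree except for their last index, both indices are sexts, and the add
carries nsw, which proves that adding 1 to %mul cannot overflow.

  ; Hypothetical reduction of the foo_3double pattern above; %gep1 is one
  ; double past %gep0.
  define double @pair(double* %A, i32 %i) {
  entry:
    %mul = mul nsw i32 %i, 3
    %idx0 = sext i32 %mul to i64
    %gep0 = getelementptr inbounds double, double* %A, i64 %idx0
    %v0 = load double, double* %gep0, align 8
    %add1 = add nsw i32 %mul, 1
    %idx1 = sext i32 %add1 to i64
    %gep1 = getelementptr inbounds double, double* %A, i64 %idx1
    %v1 = load double, double* %gep1, align 8
    %sum = fadd double %v0, %v1
    ret double %sum
  }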
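
Third, the "second attempt" argument (a known-zero bit anywhere other than
the sign bit rules out overflow of either kind when adding 1) can be checked
exhaustively at a small bit width. A standalone plain-C++ sketch, independent
of LLVM:

  #include <cassert>
  #include <cstdint>

  // Verify at 8 bits: if any bit of x other than the sign bit is known to
  // be zero, then x + 1 wraps neither as unsigned nor as signed. Unsigned
  // wrap requires x == 0xFF and signed wrap requires x == 0x7F, and both of
  // those values have all seven non-sign bits set.
  int main() {
    for (unsigned x = 0; x <= 0xFF; ++x) {
      bool HasZeroNonSignBit = (~x & 0x7Fu) != 0;
      bool UnsignedWrap = uint8_t(x + 1) < uint8_t(x);
      bool SignedWrap = int8_t(x) == INT8_MAX;
      if (HasZeroNonSignBit)
        assert(!UnsignedWrap && !SignedWrap);
    }
    return 0;
  }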