diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -143,9 +143,13 @@
     unsigned Destination;
     /// The type of the dependence.
     DepType Type;
+    /// The reorder information. Only Backward dependences may have this.
+    SmallVector<Instruction *, 2> ReorderInfo;
 
-    Dependence(unsigned Source, unsigned Destination, DepType Type)
-        : Source(Source), Destination(Destination), Type(Type) {}
+    Dependence(unsigned Source, unsigned Destination, DepType Type,
+               SmallVector<Instruction *, 2> ReorderInfo)
+        : Source(Source), Destination(Destination), Type(Type),
+          ReorderInfo(ReorderInfo) {}
 
     /// Return the source instruction of the dependence.
     Instruction *getSource(const LoopAccessInfo &LAI) const;
@@ -205,6 +209,10 @@
     return Status == VectorizationSafetyStatus::Safe;
   }
 
+  /// Returns true if all the unsafe dependences are backward and have
+  /// non-empty reorder information.
+  bool canUnsafeDepsVectorize() const;
+
   /// Return true if the number of elements that are safe to operate on
   /// simultaneously is not bounded.
   bool isSafeForAnyVectorWidth() const {
@@ -280,6 +288,11 @@
   // We can access this many bytes in parallel safely.
   uint64_t MaxSafeDepDistBytes;
 
+  /// True if there is an unsafe backward dependence whose minimum safe
+  /// distance is greater than the distance of another backward dependence
+  /// that is already known.
+  bool UnsafeBackwardMinDistGTOtherBackwardDist;
+
   /// Number of elements (from consecutive iterations) that are safe to
   /// operate on simultaneously, multiplied by the size of the element in bits.
   /// The size of the element is taken from the memory access that is most
@@ -320,6 +333,37 @@
                     const MemAccessInfo &B, unsigned BIdx,
                     const ValueToValueMap &Strides);
 
+  /// Returns true if the DESTINST instruction can be moved before the
+  /// SRCINST instruction, and adds all the involved instructions to the
+  /// INSTS list.
+  ///
+  /// This method performs the following checks recursively:
+  ///   Step 1: call canMoveInstrs to check whether all the operands of
+  ///           DESTINST can be moved.
+  ///   Step 2: call canMoveBetweenInstrs to check whether the instructions
+  ///           between SRCINST and DESTINST that read/write the same
+  ///           underlying object as DESTINST can be moved.
+  bool canMoveInstrs(llvm::Instruction *SrcInst, llvm::Instruction *DestInst,
+                     SmallVector<Instruction *, 2> &Insts,
+                     const ValueToValueMap &Strides);
+
+  /// Returns true if the instructions between SRCINST and DESTINST that
+  /// read/write the same underlying object as DESTINST does can be moved
+  /// before the SRCINST instruction.
+  ///
+  /// This method is called by canMoveInstrs and calls canMoveInstrs too.
+  /// Also see canMoveInstrs.
+  bool canMoveBetweenInstrs(llvm::Instruction *SrcInst,
+                            llvm::Instruction *DestInst,
+                            SmallVector<Instruction *, 2> &Insts,
+                            const ValueToValueMap &Strides);
+
+  /// Returns true if the accesses can be turned into a forward dependence
+  /// by reordering instructions.
+  bool isReorderingToForwardPossible(
+      unsigned AIdx, unsigned BIdx, const ValueToValueMap &Strides,
+      SmallVector<Instruction *, 2> &ReorderNeededInstructionList);
+
   /// Check whether the data dependence could prevent store-load
   /// forwarding.
   ///
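For orientation, below is a minimal sketch of how a client of MemoryDepChecker might consume the new ReorderInfo field together with canUnsafeDepsVectorize(). It is illustrative only and not part of the patch; the helper name collectReorderWork and the SmallVector element counts reconstructed above are assumptions.

// Illustrative sketch (not part of the patch): gather the instructions that
// would have to be hoisted for every Backward dependence the checker recorded.
#include "llvm/Analysis/LoopAccessAnalysis.h"
using namespace llvm;

static void collectReorderWork(const MemoryDepChecker &DepChecker,
                               SmallVectorImpl<Instruction *> &ToMove) {
  // Bail out unless every unsafe dependence is a Backward dependence that
  // carries non-empty reorder information.
  if (!DepChecker.canUnsafeDepsVectorize())
    return;
  if (const auto *Deps = DepChecker.getDependences())
    for (const MemoryDepChecker::Dependence &Dep : *Deps)
      if (Dep.Type == MemoryDepChecker::Dependence::Backward)
        // ReorderInfo lists the instructions that must be moved before the
        // dependence source for the loop to become vectorizable.
        ToMove.append(Dep.ReorderInfo.begin(), Dep.ReorderInfo.end());
}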
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1623,6 +1623,7 @@
   if (MinDistanceNeeded > MaxSafeDepDistBytes) {
     LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
                       << MinDistanceNeeded << " size in bytes");
+    UnsafeBackwardMinDistGTOtherBackwardDist = true;
     return Dependence::Backward;
   }
 
@@ -1658,11 +1659,283 @@
   return Dependence::BackwardVectorizable;
 }
 
+bool MemoryDepChecker::canMoveInstrs(
+    Instruction *SrcInst, Instruction *DestInst,
+    SmallVector<Instruction *, 2> &Insts, const ValueToValueMap &Strides) {
+  LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): " << *SrcInst << "\n"
+                    << "  <---> " << *DestInst << "\n");
+
+  bool CanMove = true;
+  unsigned OpNums;
+
+  // Recursive exit 1: DestInst has already been checked.
+  if (std::find(Insts.begin(), Insts.end(), DestInst) != Insts.end())
+    return true;
+  else
+    Insts.push_back(DestInst);
+
+  // Recursive exit 2: DestInst is the same as SrcInst.
+  if (DestInst == SrcInst) {
+    LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): The source instruction and the "
+                      << "destination instruction are the same\n");
+    return false;
+  }
+
+  // Recursive exit 3: both access the same address.
+  // During the recursion, DestInst and SrcInst may operate on the same
+  // address.
+  if (isa<StoreInst>(DestInst) || isa<LoadInst>(DestInst)) {
+    Value *SrcPtr, *DestPtr;
+
+    if (isa<StoreInst>(SrcInst))
+      SrcPtr = cast<StoreInst>(SrcInst)->getPointerOperand();
+    else
+      SrcPtr = cast<LoadInst>(SrcInst)->getPointerOperand();
+
+    if (isa<StoreInst>(DestInst))
+      DestPtr = cast<StoreInst>(DestInst)->getPointerOperand();
+    else
+      DestPtr = cast<LoadInst>(DestInst)->getPointerOperand();
+
+    if (SrcPtr == DestPtr) {
+      LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Store/Load instructions access "
+                        << "the same address\n");
+      return false;
+    }
+  }
+
+  // STEP 1: Check the operands.
+  OpNums = DestInst->getNumOperands();
+  for (unsigned Idx = 0; Idx < OpNums; Idx++) {
+    Instruction *OpInst = dyn_cast<Instruction>(DestInst->getOperand(Idx));
+
+    // The operand may just be an llvm::Value, e.g.:
+    //   a constant
+    //   a function parameter
+    //   a global variable
+    if (!OpInst)
+      continue;
+
+    BasicBlock *OpBB = OpInst->getParent();
+    BasicBlock *DestBB = DestInst->getParent();
+    // This basic block may use instructions from another basic block, and
+    // that block may in turn use instructions from this one, so keep
+    // checking.
+    if (OpBB != DestBB) {
+      // The operand is defined outside the current function.
+      // FIXME: Don't know if this is necessary.
+      if (OpBB->getParent() != DestBB->getParent())
+        continue;
+
+      // FIXME: Don't know if there is any situation that can stop the
+      // checking here.
+    }
+
+    LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Operand instructions' check\n");
+    CanMove = canMoveInstrs(SrcInst, OpInst, Insts, Strides);
+    if (!CanMove)
+      return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Between instructions' check\n");
+
+  // STEP 2: Check the instructions between SrcInst and DestInst.
+  if (SrcInst->getParent() == DestInst->getParent() &&
+      SrcInst->comesBefore(DestInst))
+    return canMoveBetweenInstrs(SrcInst, DestInst, Insts, Strides);
+  else
+    return true;
+}
+
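To illustrate STEP 1 above, consider the kind of loop the new tests exercise. The snippet below is an illustrative C-level sketch (not part of the patch) that marks the def-use chain canMoveInstrs has to pull above the dependence source.

// Illustrative sketch (not part of the patch): for the backward dependence
// between the store to a[i] and the load of a[i + 1], moving the dependence
// destination above the source also requires moving everything it uses.
void example_step1(float *a, float *b, float *c, int n) {
  for (int i = 0; i < n - 1; i++) {
    a[i] = c[i] * 10.0f;     // dependence source (store to a[i])
    b[i] = a[i + 1] + 1.0f;  // dependence destination (store to b[i])
    // STEP 1 recurses through the operands of the destination store: the
    // add, the load of a[i + 1], and its address computation all have to be
    // movable above the store to a[i].
  }
}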
+bool MemoryDepChecker::canMoveBetweenInstrs(
+    Instruction *SrcInst, Instruction *DestInst,
+    SmallVector<Instruction *, 2> &Insts, const ValueToValueMap &Strides) {
+  LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): " << *SrcInst << "\n"
+                    << "  <---> " << *DestInst << "\n");
+
+  bool CanMove = true;
+
+  Value *Ptr = nullptr;
+  if (isa<StoreInst>(DestInst))
+    Ptr = cast<StoreInst>(DestInst)->getPointerOperand();
+  else if (isa<LoadInst>(DestInst))
+    Ptr = cast<LoadInst>(DestInst)->getPointerOperand();
+
+  // DestInst is not a load/store instruction.
+  if (!Ptr)
+    return true;
+
+  int64_t Stride = getPtrStride(PSE, Ptr, InnermostLoop, Strides, true, true);
+  Value *UnderlyingObject = getUnderlyingObject(Ptr, 6);
+
+  for (BasicBlock::iterator I = --(DestInst->getIterator()),
+                            Begin = SrcInst->getIterator();
+       I != Begin; --I) {
+    // The current instruction may already have been checked as an operand.
+    if (std::find(Insts.begin(), Insts.end(), (&*I)) != Insts.end())
+      continue;
+
+    // FIXME: Don't know how to check call instructions that read/write
+    // memory.
+    if (isa<CallInst>(&*I) &&
+        ((&*I)->mayReadFromMemory() || (&*I)->mayWriteToMemory())) {
+      LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Found memory access call: "
+                        << (*I) << "\n");
+      return false;
+    }
+
+    if (isa<StoreInst>(&*I) || isa<LoadInst>(&*I)) {
+      Value *PtrTemp = nullptr;
+      if (isa<StoreInst>(&*I))
+        PtrTemp = cast<StoreInst>(&*I)->getPointerOperand();
+      else if (isa<LoadInst>(&*I))
+        PtrTemp = cast<LoadInst>(&*I)->getPointerOperand();
+
+      Value *UnderlyingObjectTemp = getUnderlyingObject(PtrTemp, 6);
+
+      LLVM_DEBUG(dbgs() << "LAA(ReorderInfo):\n"
+                        << "  UnderlyingObject: " << *UnderlyingObject << "\n"
+                        << "  UnderlyingObjectTemp: " << *UnderlyingObjectTemp
+                        << "\n");
+
+      // getUnderlyingObject may still return an Instruction, in which case
+      // the following comparison is false and the current instruction is
+      // assumed not to need moving. If it actually did need to be moved,
+      // that would mean there is a memory conflict between the current
+      // instruction and DestInst; the runtime memory checks will detect the
+      // conflict and fall back to the original scalar loop.
+      if (UnderlyingObjectTemp == UnderlyingObject) {
+        const SCEV *First = PSE.getSCEV(PtrTemp);
+        const SCEV *Second = PSE.getSCEV(Ptr);
+        const SCEV *Dist = PSE.getSE()->getMinusSCEV(Second, First);
+        const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+
+        if (!C) {
+          LLVM_DEBUG(dbgs()
+                     << "LAA(ReorderInfo): Found non-constant distance\n");
+          return false;
+        }
+
+        const APInt &Val = C->getAPInt();
+        int64_t Distance = Val.getSExtValue();
+        LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Distance between:\n"
+                          << "  " << *PtrTemp << "\n"
+                          << "  " << *Ptr << "\n"
+                          << "  is " << Distance << "\n");
+
+        // Related instructions need to be moved along with DestInst, so
+        // check whether they can be moved. E.g.:
+        //
+        //   Stride > 0
+        //     SrcInst:     c[i] = a[i];
+        //     RelatedInst: b[i + 1] = c[i];
+        //     DestInst:    a[i + 1] = b[i];
+        //
+        //   Stride < 0
+        //     SrcInst:     c[i] = a[i];
+        //     RelatedInst: b[i - 1] = c[i];
+        //     DestInst:    a[i - 1] = b[i];
+        if ((Stride > 0 && Distance <= 0) || (Stride < 0 && Distance >= 0)) {
+          LLVM_DEBUG(dbgs()
+                     << "LAA(ReorderInfo): Found a related instruction\n");
+          CanMove = canMoveInstrs(SrcInst, &*I, Insts, Strides);
+          if (!CanMove)
+            return false;
+        }
+      }
+    }
+  }
+
+  return CanMove;
+}
+
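The related-instruction rule is easiest to see on foo5 from the new test added below. The following C-level sketch is illustrative only (not part of the patch) and marks which statements have to be hoisted as a group for the backward dependence on a[] to become forward.

// Illustrative sketch (not part of the patch), modeled on foo5 in the new
// test: to move the store to a[i + 2] above the load of a[i], its operand
// load of c[i] must move too (STEP 1), and the intervening store to c[i + 2],
// which writes the same underlying object as that load, must be hoisted
// together with it (STEP 2).
void example_related(float *a, float *b, float *c, float *d, int n) {
  for (int i = 0; i < n - 2; i++) {
    b[i] = a[i] * 2.0f;      // dependence source: load of a[i]
    c[i + 2] = d[i];         // related store: same underlying object as c[i]
    a[i + 2] = c[i] + 1.0f;  // dependence destination: store to a[i + 2]
  }
}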
+bool MemoryDepChecker::isReorderingToForwardPossible(
+    unsigned AIdx, unsigned BIdx, const ValueToValueMap &Strides,
+    SmallVector<Instruction *, 2> &ReorderNeededInstructionList) {
+  Instruction *SourceInst = InstMap[AIdx];
+  Instruction *DestInst = InstMap[BIdx];
+
+  if (SourceInst->getParent() != DestInst->getParent()) {
+    LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Accesses in different basic "
+                      << "blocks are not supported for reordering.\n"
+                      << "  Src:  " << *SourceInst << "\n"
+                      << "  Dest: " << *DestInst << "\n");
+    return false;
+  }
+
+  // For the following case, if the DestInst is moved before the SourceInst,
+  // it should not prevent store-load forwarding.
+  //   SourceInst: load a[i]
+  //   DestInst:   store a[i + n]
+  if (isa<LoadInst>(SourceInst) && isa<StoreInst>(DestInst) &&
+      EnableForwardingConflictDetection) {
+    // Get the distance between the SourceInst and the DestInst.
+    Value *SourcePtr = cast<LoadInst>(SourceInst)->getPointerOperand();
+    Value *DestPtr = cast<StoreInst>(DestInst)->getPointerOperand();
+    llvm::Type *SourceTy = SourcePtr->getType()->getPointerElementType();
+    const auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
+    uint64_t TypeByteSize = DL.getTypeAllocSize(SourceTy);
+    const SCEV *Source = PSE.getSCEV(SourcePtr);
+    const SCEV *Dest = PSE.getSCEV(DestPtr);
+    const SCEV *Dist = PSE.getSE()->getMinusSCEV(Dest, Source);
+    const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+
+    if (!C) {
+      LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Found non-vectorizable store "
+                        << "after load dependence (non-constant distance):\n"
+                        << "  Src:  " << *SourceInst << "\n"
+                        << "  Dest: " << *DestInst << "\n");
+      return false;
+    }
+
+    const APInt &Val = C->getAPInt();
+    int64_t Distance = Val.getSExtValue();
+
+    // Do a check like MemoryDepChecker::couldPreventStoreLoadForward.
+    //
+    // The difference is that here the distance is smaller than the minimum
+    // distance that vectorization needs, so only the following check is
+    // required.
+    if (Distance % (2 * TypeByteSize)) {
+      LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Found non-vectorizable store "
+                        << "after load dependence (may prevent forwarding):\n"
+                        << "  Src:  " << *SourceInst << "\n"
+                        << "  Dest: " << *DestInst << "\n");
+      return false;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): The original basic block:\n"
+                    << *(SourceInst->getParent()) << "\n");
+
+  SmallVector<Instruction *, 2> CheckedInsts;
+  bool BackwardCanVectorize =
+      canMoveInstrs(SourceInst, DestInst, CheckedInsts, Strides);
+
+  LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): canMoveInstrs returned: "
+                    << BackwardCanVectorize << "\n");
+
+  if (BackwardCanVectorize) {
+    // CheckedInsts may contain instructions that do not need to be moved,
+    // e.g.:
+    //   - an instruction in the for.preheader block used by DestInst
+    //   - an instruction already before SourceInst used by DestInst
+    for (auto *I : CheckedInsts) {
+      if (I->getParent() == SourceInst->getParent() &&
+          SourceInst->comesBefore(I))
+        ReorderNeededInstructionList.push_back(I);
+    }
+  }
+
+  return BackwardCanVectorize;
+}
+
 bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
                                    MemAccessInfoList &CheckDeps,
                                    const ValueToValueMap &Strides) {
 
   MaxSafeDepDistBytes = -1;
+  UnsafeBackwardMinDistGTOtherBackwardDist = false;
   SmallPtrSet<MemAccessInfo, 8> Visited;
   for (MemAccessInfo CurAccess : CheckDeps) {
     if (Visited.count(CurAccess))
@@ -1712,8 +1985,19 @@
           // unsafe dependence. This puts a limit on this quadratic
           // algorithm.
           if (RecordDependences) {
-            if (Type != Dependence::NoDep)
-              Dependences.push_back(Dependence(A.second, B.second, Type));
+            SmallVector<Instruction *, 2> ReorderNeededInstructionList;
+            if (Type == Dependence::Backward &&
+                !UnsafeBackwardMinDistGTOtherBackwardDist) {
+              isReorderingToForwardPossible(A.second, B.second, Strides,
+                                            ReorderNeededInstructionList);
+            }
+
+            if (Type != Dependence::NoDep) {
+              Dependences.push_back(
+                  Dependence(A.second, B.second, Type,
+                             ReorderNeededInstructionList));
+            }
 
             if (Dependences.size() >= MaxDependences) {
               RecordDependences = false;
@@ -1735,6 +2019,25 @@
   return isSafeForVectorization();
 }
 
+bool MemoryDepChecker::canUnsafeDepsVectorize() const {
+  if (UnsafeBackwardMinDistGTOtherBackwardDist)
+    return false;
+
+  if (!getDependences())
+    return false;
+
+  for (const auto &Dep : Dependences) {
+    if (Dep.Type == Dependence::Backward && Dep.ReorderInfo.empty())
+      return false;
+    else if (Dep.Type != Dependence::Backward &&
+             Dependence::isSafeForVectorization(Dep.Type) !=
+                 VectorizationSafetyStatus::Safe)
+      return false;
+  }
+
+  return true;
+}
+
 SmallVector<Instruction *, 4>
 MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
   MemAccessInfo Access(Ptr, isWrite);
@@ -2070,6 +2373,16 @@
     return;
   }
 
+  if (!CanVecMem) {
+    // Check whether all the dependences can be made safe by reordering
+    // instructions.
+    CanVecMem = DepChecker->canUnsafeDepsVectorize();
+    LLVM_DEBUG(dbgs() << "LAA: We can "
+                      << (CanVecMem ? "" : "not ")
+                      << "vectorize the loop by reordering instructions.\n");
+  }
+
   if (CanVecMem)
     LLVM_DEBUG(
         dbgs() << "LAA: No unsafe dependent memory operations in loop.  We"
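One step that is easy to gloss over is the store-load forwarding guard in isReorderingToForwardPossible. A small worked example (illustrative, not from the patch) of the same condition: with 4-byte elements, a load-to-store distance of 4 bytes fails Distance % (2 * TypeByteSize) (4 % 8 == 4) and reordering is rejected, which is why bar3 in the new test is not vectorized, while a distance of 8 bytes passes.

// Illustrative check (not part of the patch): mirrors the guard used in
// isReorderingToForwardPossible for a load (source) followed by a store
// (destination) to the same array.
#include <cstdint>
#include <cstdio>

static bool mayPreventForwarding(int64_t DistanceBytes, uint64_t TypeByteSize) {
  // A distance that is not a multiple of 2 * TypeByteSize is treated as
  // likely to block store-load forwarding.
  return DistanceBytes % (2 * TypeByteSize) != 0;
}

int main() {
  printf("%d\n", mayPreventForwarding(4, 4)); // 1: a[i+1] vs a[i], 4-byte type
  printf("%d\n", mayPreventForwarding(8, 4)); // 0: a[i+2] vs a[i], 4-byte type
}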
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9433,6 +9433,73 @@
                                 F, &Hints, IAI);
   CM.collectValuesToIgnore();
 
+  // Get the memory access reorder information.
+  auto *LAI = LVL.getLAI();
+  const MemoryDepChecker &Checker = LAI->getDepChecker();
+  const auto *Dependences = Checker.getDependences();
+  SmallVector<std::pair<Instruction *, SmallVector<Instruction *, 2>>, 2>
+      InstPairs;
+  for (const auto &Dep : *Dependences) {
+    if (Dep.Type != MemoryDepChecker::Dependence::Backward)
+      continue;
+
+    InstPairs.push_back(std::make_pair(Dep.getSource(*LAI), Dep.ReorderInfo));
+  }
+
+  // The following sorts are important: they ensure the correctness of the
+  // move and keep the order of instructions after the move close to the
+  // order before the move.
+  for (auto &InstPair : InstPairs) {
+    SmallVectorImpl<Instruction *> &Insts = InstPair.second;
+    llvm::sort(Insts.begin(), Insts.end(),
+               [](Instruction *L, Instruction *R) { return L->comesBefore(R); });
+  }
+
+  llvm::sort(InstPairs.begin(), InstPairs.end(),
+             [](const std::pair<Instruction *, SmallVector<Instruction *, 2>> &L,
+                const std::pair<Instruction *, SmallVector<Instruction *, 2>> &R) {
+               return L.second.front()->comesBefore(R.second.front());
+             });
+
+  // After the planner has run, these reorderings will be reverted.
+  SmallPtrSet<BasicBlock *, 2> BlockPtrsSet;
+  SmallVector<SmallVector<Instruction *, 2>, 2> OriginBlocks;
+  for (auto &InstPair : InstPairs) {
+    Instruction *SourceInst = InstPair.first;
+    SmallVectorImpl<Instruction *> &Insts = InstPair.second;
+
+    // Save the original order for reverting.
+    if (!BlockPtrsSet.contains(SourceInst->getParent())) {
+      BlockPtrsSet.insert(SourceInst->getParent());
+      SmallVector<Instruction *, 2> OriginInstrs;
+      for (Instruction &I : *(SourceInst->getParent())) {
+        OriginInstrs.push_back(&I);
+      }
+      OriginBlocks.push_back(OriginInstrs);
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: Move instructions before: " << *SourceInst
+                      << "\n");
+
+    for (Instruction *I : Insts) {
+      // The instruction may already be before SourceInst because of an
+      // earlier move; if so, go to the next one.
+      if (I->comesBefore(SourceInst))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "  " << *I << "\n");
+
+      I->moveBefore(SourceInst);
+    }
+  }
+
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
 
@@ -9443,6 +9510,19 @@
   // Plan how to best vectorize, return the best VF and its cost.
   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
 
+  // Revert to the original loop order.
+  for (auto &OB : OriginBlocks) {
+    BasicBlock *BB = OB.front()->getParent();
+
+    LLVM_DEBUG(dbgs() << "LV: The reordered basic block:\n" << *BB << "\n");
+
+    // Revert the instruction order.
+    for (auto *I : OB) {
+      I->removeFromParent();
+      BB->getInstList().push_back(I);
+    }
+  }
+
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
 
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
--- a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
@@ -145,7 +145,9 @@
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 }
 
-; int unsafe_Write_Read(int *A) {
+; Following cases can be vectorized by reordering instructions.
+ +; int vectorizable_by_reorder_Write_Read(int *A) { ; int sum = 0; ; for (unsigned i = 0; i < 1024; i+=4) { ; A[i] = i; @@ -155,15 +157,15 @@ ; return sum; ; } -; CHECK: function 'unsafe_Write_Read': +; CHECK: function 'vectorizable_by_reorder_Write_Read': ; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Backward: ; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 -> ; CHECK-NEXT: %1 = load i32, i32* %arrayidx2, align 4 -define i32 @unsafe_Write_Read(i32* nocapture %A) { +define i32 @vectorizable_by_reorder_Write_Read(i32* nocapture %A) { entry: br label %for.body @@ -184,22 +186,22 @@ br i1 %cmp, label %for.body, label %for.cond.cleanup } -; void unsafe_Write_Write(int *A) { +; void vectorizable_by_reorder_Write_Write(int *A) { ; for (unsigned i = 0; i < 1024; i+=2) { ; A[i] = i; ; A[i+2] = i+1; ; } ; } -; CHECK: function 'unsafe_Write_Write': +; CHECK: function 'vectorizable_by_reorder_Write_Write': ; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Backward: ; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 -> ; CHECK-NEXT: store i32 %2, i32* %arrayidx3, align 4 -define void @unsafe_Write_Write(i32* nocapture %A) { +define void @vectorizable_by_reorder_Write_Write(i32* nocapture %A) { entry: br label %for.body @@ -490,7 +492,7 @@ } ; Following case checks that interleaved stores have dependences with another -; store and can not pass dependence check. +; store and can pass dependence check with reordering. ; void interleaved_stores(int *A) { ; int *B = (int *) ((char *)A + 1); @@ -505,7 +507,7 @@ ; CHECK: function 'interleaved_stores': ; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Backward: ; CHECK-NEXT: store i32 %4, i32* %arrayidx5, align 4 -> diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll --- a/llvm/test/Transforms/LoopVectorize/memdep.ll +++ b/llvm/test/Transforms/LoopVectorize/memdep.ll @@ -60,6 +60,7 @@ } ; Plausible dependence of distance 2 - can be vectorized with a width of 2. +; And can be vectorized with a width of 4 with reordering accesses. ; for (i = 0; i < 1024; ++i) ; A[i+2] = A[i] + 1; @@ -67,7 +68,7 @@ ; CHECK: <2 x i32> ; WIDTH: f3_vec_len -; WIDTH-NOT: <4 x i32> +; WIDTH: <4 x i32> define void @f3_vec_len(i32* %A) { entry: @@ -91,7 +92,7 @@ ret void } -; Plausible dependence of distance 1 - cannot be vectorized (without reordering +; Plausible dependence of distance 1 - can be vectorized (with reordering ; accesses). ; for (i = 0; i < 1024; ++i) { ; B[i] = A[i]; @@ -99,7 +100,7 @@ ; } ; CHECK-LABEL: @f5( -; CHECK-NOT: <2 x i32> +; CHECK: <2 x i32> define void @f5(i32* %A, i32* %B) { entry: @@ -244,8 +245,10 @@ ; RIGHTVF-LABEL: @pr34283 ; RIGHTVF: <4 x i64> +;Before backward dependence can be vectorized, the following check +;is 'WRONGVF-NOT: <8 x i64>'. Now it can be vectorized. 
; WRONGVF-LABLE: @pr34283 -; WRONGVF-NOT: <8 x i64> +; WRONGVF: <8 x i64> @a = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16 diff --git a/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll b/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll @@ -0,0 +1,495 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +; The C code corresponding to the following cases are as follows +; +; #define N 1024 +; +; float a[N], b[N], c[N], d[N], e[N], f[N]; +; +; // Loops that can be vectorized +; void foo(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; a[i] = c[i] * 10; +; b[i] = a[i + 1] + 1; +; } +; } +; +; void foo1(int LEN) { +; for (int i = LEN - 2; i > -1; i--) { +; a[i + 1] = c[i] * 10; +; b[i] = a[i] + 1; +; } +; } +; +; void foo2(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; a[i] = c[i] * 10; +; b[i] = a[i + 1] + 1; +; d[i] = b[i + 1] - 2; +; } +; } +; +; void foo3(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; a[i] = b[i] * 2; +; a[i + 1] = c[i] + 1; +; } +; } +; +; void foo4(int LEN) { +; for (int i = 0; i < LEN - 2; i++) { +; b[i] = a[i] * 2; +; a[i + 2] = c[i] + 1; +; } +; } +; +; void foo5(int LEN) { +; for (int i = 0; i < LEN - 2; i++) { +; b[i] = a[i] * 2; +; c[i + 2] = d[i]; +; a[i + 2] = c[i] + 1; +; } +; } +; +; // Loops that cann't be vectorized +; // Backward dependence belongs to different blocks. +; void bar(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; if (i % 2) +; a[i] = c[i] * 10; +; b[i] = a[i + 1] + 1; +; } +; } +; +; // Backward dependence is true data dependence (operands check). +; void bar1(int LEN) { +; for (int i = 0; i < LEN - 2; i++) { +; b[i] = a[i]; +; a[i + 2] = b[i]; +; } +; } +; +; // Backward dependence is true data dependence (related check). +; void bar2(int LEN) { +; for (int i = 0; i < LEN - 2; i++) { +; b[i] = a[i]; +; c[i + 2] = b[i]; +; a[i + 2] = c[i]; +; } +; } +; +; // Backward dependence turns to forwardbutpreventsforwarding. 
+; void bar3(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; b[i] = a[i] * 2; +; a[i + 1] = c[i] + 1; +; } +; } +; +@a = dso_local global [1024 x float] zeroinitializer, align 4 +@b = dso_local global [1024 x float] zeroinitializer, align 4 +@c = dso_local global [1024 x float] zeroinitializer, align 4 +@d = dso_local global [1024 x float] zeroinitializer, align 4 +@e = dso_local global [1024 x float] zeroinitializer, align 4 +@f = dso_local global [1024 x float] zeroinitializer, align 4 + +define dso_local void @foo(i32 %LEN) #0 { +; CHECK-LABEL: @foo( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +entry: + %cmp6 = icmp sgt i32 %LEN, 1 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define dso_local void @foo1(i32 %LEN) #0 { +; CHECK-LABEL: @foo1( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: for.body +; CHECK: load float +; CHECK: fmul float +; CHECK: store float +; CHECK: load float +; CHECK: fadd float +; CHECK: store float +entry: + %cmp6 = icmp sgt i32 %LEN, 1 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %1 = sext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %1, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx, align 4 + %mul = fmul float %2, 1.000000e+01 + %3 = add nsw i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %4 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %4, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.end + +for.end: 
; preds = %for.body, %entry + ret void +} + +define dso_local void @foo2(i32 %LEN) #0 { +; CHECK-LABEL: @foo2( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; REVERT-LABEL: @foo2( +entry: + %cmp8 = icmp sgt i32 %LEN, 1 + br i1 %cmp8, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %arrayidx10 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv.next + %3 = load float, float* %arrayidx10, align 4 + %sub11 = fadd float %3, -2.000000e+00 + %arrayidx13 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %indvars.iv + store float %sub11, float* %arrayidx13, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define dso_local void @foo3(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo3( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: store <4 x float> +entry: + %cmp14 = icmp sgt i32 %LEN, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + store float %add, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @foo4(i32 
%LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo4( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: store <4 x float> +entry: + %cmp14 = icmp sgt i32 %LEN, 2 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %add, float* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @foo5(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo5( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: store <4 x float> +entry: + %cmp21 = icmp sgt i32 %LEN, 2 + br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %3 + store float %2, float* %arrayidx6, align 4 + %arrayidx8 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %4 = load float, float* %arrayidx8, align 4 + %add9 = fadd float %4, 1.000000e+00 + %arrayidx12 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %add9, float* %arrayidx12, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar(i32 %LEN) #0 { +; CHECK-LABEL: @bar( +; CHECK-NOT: vector.body +; CHECK: for.body +; CHECK: 
if.then +; CHECK: load float +; CHECK: fmul float +; CHECK: store float +; CHECK: if.end +; CHECK: load float +; CHECK: fadd float +; CHECK: store float +entry: + %cmp7 = icmp sgt i32 %LEN, 1 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %if.end + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end ] + %rem10 = and i64 %indvars.iv, 1 + %tobool.not = icmp eq i64 %rem10, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %if.end, %entry + ret void +} + +define dso_local void @bar1(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar1( +; CHECK-NOT: vector.body +entry: + %cmp13 = icmp sgt i32 %LEN, 2 + br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %1, float* %arrayidx2, align 4 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %2 + store float %1, float* %arrayidx6, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar2(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar2( +; CHECK-NOT: vector.body +entry: + %cmp20 = icmp sgt i32 %LEN, 2 + br i1 %cmp20, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 
%indvars.iv + store float %1, float* %arrayidx2, align 4 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %2 + store float %1, float* %arrayidx6, align 4 + %arrayidx8 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %3 = load float, float* %arrayidx8, align 4 + %arrayidx11 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %2 + store float %3, float* %arrayidx11, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar3(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar3( +; CHECK-NOT: vector.body +entry: + %cmp14 = icmp sgt i32 %LEN, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + store float %add, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}