diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -215,6 +215,13 @@
   /// the accesses safely with.
   uint64_t getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
 
+  /// True if there is an unsafe backward dependence whose
+  /// minimum safe distance is greater than another backward dependence's
+  /// distance which is already known.
+  bool isUnsafeBackwardMinDistGTOtherBackwardDist() const {
+    return UnsafeBackwardMinDistGTOtherBackwardDist;
+  }
+
   /// Return the number of elements that are safe to operate on
   /// simultaneously, multiplied by the size of the element in bits.
   uint64_t getMaxSafeVectorWidthInBits() const {
@@ -280,6 +287,11 @@
   // We can access this many bytes in parallel safely.
   uint64_t MaxSafeDepDistBytes;
 
+  /// True if there is an unsafe backward dependence whose
+  /// minimum safe distance is greater than another backward dependence's
+  /// distance which is already known.
+  bool UnsafeBackwardMinDistGTOtherBackwardDist;
+
   /// Number of elements (from consecutive iterations) that are safe to
   /// operate on simultaneously, multiplied by the size of the element in bits.
   /// The size of the element is taken from the memory access that is most
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -235,6 +235,17 @@
   /// inductions and reductions.
   using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
 
+  /// ReorderList contains the instructions which should be reordered
+  /// before vectorization.
+  ///
+  /// The items have the following format:
+  ///
+  ///   std::pair<Instruction *, SmallVector<Instruction *, 4>>
+  ///     .first  --> a load/store instruction
+  ///     .second --> a list of instructions to be moved before the first
+  using ReorderList =
+      SmallVector<std::pair<Instruction *, SmallVector<Instruction *, 4>>, 2>;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -350,6 +361,19 @@
     return ConditionalAssumes;
   }
 
+  /// Returns the ReorderNeededInstructionPairs list.
+  ///
+  /// The ReorderNeededInstructionPairs list is filled when
+  /// canVectorizeMemory finds unsafe memory accesses that can be made
+  /// safe by reordering the involved instructions. Also see
+  /// canUnsafeDepsVectorize.
+  ///
+  /// The pairs are sorted in program order, and the instruction list in
+  /// each second field is also sorted in program order.
+  ReorderList &getReorderNeededInstructionPairs() {
+    return ReorderNeededInstructionPairs;
+  }
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
@@ -384,6 +408,39 @@
   /// Returns true if the loop is vectorizable
   bool canVectorizeMemory();
 
+  /// Returns true if the unsafe memory accesses reported by
+  /// MemoryDepChecker can be made safe by reordering the
+  /// involved instructions.
+  ///
+  /// This method may add the involved instruction pairs to the
+  /// ReorderNeededInstructionPairs list, and the client should
+  /// reorder each instruction pair before vectorization. Also
+  /// see getReorderNeededInstructionPairs.
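+  ///
+  /// Illustrative example (it mirrors foo() in the added test): for
+  ///   a[i] = c[i] * 10;     // store, dependence source
+  ///   b[i] = a[i + 1] + 1;  // load of a[i + 1], dependence sink
+  /// hoisting the load of a[i + 1] (together with its address
+  /// computation) above the store to a[i] turns the unsafe backward
+  /// dependence into a forward dependence, which is safe to vectorize.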
+  bool canUnsafeDepsVectorize();
+
+  /// Returns true if the DESTINST instruction can be moved before
+  /// the SRCINST instruction, and adds all the involved instructions
+  /// to the INSTS list.
+  ///
+  /// This method does the following checks recursively:
+  ///   Step 1: call canMoveInstrs to check whether all the operands of
+  ///           DESTINST can be moved.
+  ///   Step 2: call canMoveBetweenInstrs to check whether the instructions
+  ///           between SRCINST and DESTINST which read/write the same
+  ///           underlying object as DESTINST can be moved.
+  bool canMoveInstrs(llvm::Instruction *SrcInst, llvm::Instruction *DestInst,
+                     SmallVector<Instruction *, 4> &Insts);
+
+  /// Returns true if the instructions between SRCINST and DESTINST
+  /// which read/write the same underlying object as DESTINST can
+  /// be moved before the SRCINST instruction.
+  ///
+  /// This method is called by canMoveInstrs and calls canMoveInstrs
+  /// too. Also see canMoveInstrs.
+  bool canMoveBetweenInstrs(llvm::Instruction *SrcInst,
+                            llvm::Instruction *DestInst,
+                            SmallVector<Instruction *, 4> &Insts);
+
   /// Return true if we can vectorize this loop using the IF-conversion
   /// transformation.
   bool canVectorizeWithIfConvert();
@@ -514,6 +571,10 @@
   /// flattened.
   SmallPtrSet<Instruction *, 8> ConditionalAssumes;
 
+  /// Holds instruction pairs that need to be reordered.
+  /// Also see canUnsafeDepsVectorize.
+  ReorderList ReorderNeededInstructionPairs;
+
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -124,7 +124,7 @@
 /// Enable store-to-load forwarding conflict detection. This option can
 /// be disabled for correctness testing.
-static cl::opt<bool> EnableForwardingConflictDetection(
+cl::opt<bool> EnableForwardingConflictDetection(
     "store-to-load-forwarding-conflict-detection", cl::Hidden,
     cl::desc("Enable conflict detection in loop-access analysis"),
     cl::init(true));
@@ -1623,6 +1623,7 @@
     if (MinDistanceNeeded > MaxSafeDepDistBytes) {
       LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
                         << MinDistanceNeeded << " size in bytes");
+      UnsafeBackwardMinDistGTOtherBackwardDist = true;
       return Dependence::Backward;
     }
@@ -1663,6 +1664,7 @@
                               const ValueToValueMap &Strides) {
 
   MaxSafeDepDistBytes = -1;
+  UnsafeBackwardMinDistGTOtherBackwardDist = false;
   SmallPtrSet<MemAccessInfo, 8> Visited;
   for (MemAccessInfo CurAccess : CheckDeps) {
     if (Visited.count(CurAccess))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -32,6 +32,7 @@
 #define DEBUG_TYPE LV_NAME
 
 extern cl::opt<bool> EnableVPlanPredication;
+extern cl::opt<bool> EnableForwardingConflictDetection;
 
 static cl::opt<bool> EnableIfConversion("enable-if-conversion", cl::init(true),
                                         cl::Hidden,
@@ -876,15 +877,21 @@
 bool LoopVectorizationLegality::canVectorizeMemory() {
   LAI = &(*GetLAA)(*TheLoop);
-  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
-  if (LAR) {
-    ORE->emit([&]() {
-      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
-                                        "loop not vectorized: ", *LAR);
-    });
-  }
-  if (!LAI->canVectorizeMemory())
+
+  // LoopAccessInfo checks the dependences based on source order, while
+  // canUnsafeDepsVectorize checks whether the dependences that were
+  // reported unsafe can be made safe by reordering the IR.
+  if (!LAI->canVectorizeMemory() && !canUnsafeDepsVectorize()) {
+    const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+    if (LAR) {
+      ORE->emit([&]() {
+        return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+                                          "loop not vectorized: ", *LAR);
+      });
+    }
+    return false;
+  }
 
   if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
     reportVectorizationFailure("Stores to a uniform address",
@@ -898,6 +905,343 @@
   return true;
 }
 
+bool LoopVectorizationLegality::canMoveInstrs(
+    Instruction *SrcInst, Instruction *DestInst,
+    SmallVector<Instruction *, 4> &Insts) {
+  LLVM_DEBUG(dbgs() << "LV: canMoveInstrs: " << *SrcInst << "\n"
+                    << "    <---> " << *DestInst << "\n");
+
+  bool CanMove = true;
+  unsigned OpNums;
+
+  // Recursive exit 1: already checked
+  if (std::find(Insts.begin(), Insts.end(), DestInst) != Insts.end())
+    return true;
+  else
+    Insts.push_back(DestInst);
+
+  // Recursive exit 2: the DestInst is the same as SrcInst
+  if (DestInst == SrcInst) {
+    LLVM_DEBUG(dbgs() << "LV: canMoveInstrs: The source instruction and "
+                      << "destination instruction are the same!!!\n");
+    return false;
+  }
+
+  // Recursive exit 3: Both access the same address
+  // In the process of recursion, the DestInst and SrcInst may operate
+  // on the same address.
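+  // (If they do access the same address, hoisting DestInst above SrcInst
+  // would swap two accesses to that location within one iteration and
+  // could change the value that is loaded or stored, so the move is
+  // rejected below.)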
+  if (isa<LoadInst>(DestInst) || isa<StoreInst>(DestInst)) {
+    Value *SrcPtr, *DestPtr;
+
+    if (isa<LoadInst>(SrcInst))
+      SrcPtr = cast<LoadInst>(SrcInst)->getPointerOperand();
+    else
+      SrcPtr = cast<StoreInst>(SrcInst)->getPointerOperand();
+
+    if (isa<LoadInst>(DestInst))
+      DestPtr = cast<LoadInst>(DestInst)->getPointerOperand();
+    else
+      DestPtr = cast<StoreInst>(DestInst)->getPointerOperand();
+
+    if (SrcPtr == DestPtr) {
+      LLVM_DEBUG(dbgs() << "LV: canMoveInstrs: Store/Load instructions access "
+                        << "the same address!!!\n");
+      return false;
+    }
+  }
+
+  // STEP1: Check the operands
+  OpNums = DestInst->getNumOperands();
+  for (unsigned Idx = 0; Idx < OpNums; Idx++) {
+    Instruction *OpInst = dyn_cast<Instruction>(DestInst->getOperand(Idx));
+
+    // The operand may be a plain llvm::Value, e.g.:
+    //   - a constant
+    //   - a function argument
+    //   - a global variable
+    if (!OpInst)
+      continue;
+
+    BasicBlock *OpBB = OpInst->getParent();
+    BasicBlock *DestBB = DestInst->getParent();
+    // This basic block may use instructions from another basic block, and
+    // that block may in turn use instructions from this one, so keep
+    // checking.
+    if (OpBB != DestBB) {
+      // The operand is defined outside the current function.
+      // FIXME: Don't know if this is necessary.
+      if (OpBB->getParent() != DestBB->getParent())
+        continue;
+
+      // FIXME: Don't know if there is any situation that can stop
+      // the checking here.
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: canMoveInstrs: Operand Instructions' Check\n");
+    CanMove = canMoveInstrs(SrcInst, OpInst, Insts);
+    if (!CanMove)
+      return false;
+  }
+
+  // STEP2: Check the instructions between the SrcInst and DestInst.
+  if (SrcInst->getParent() == DestInst->getParent() &&
+      SrcInst->comesBefore(DestInst))
+    return canMoveBetweenInstrs(SrcInst, DestInst, Insts);
+  else
+    return true;
+}
+
+bool LoopVectorizationLegality::canMoveBetweenInstrs(
+    Instruction *SrcInst, Instruction *DestInst,
+    SmallVector<Instruction *, 4> &Insts) {
+  LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs: " << *SrcInst << "\n"
+                    << "    <---> " << *DestInst << "\n");
+
+  bool CanMove = true;
+
+  Value *Ptr = nullptr;
+  if (isa<LoadInst>(DestInst))
+    Ptr = cast<LoadInst>(DestInst)->getPointerOperand();
+  else if (isa<StoreInst>(DestInst))
+    Ptr = cast<StoreInst>(DestInst)->getPointerOperand();
+
+  // DestInst is not a load/store instruction.
+  if (!Ptr)
+    return true;
+
+  const ValueToValueMap &Strides =
+      getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
+  int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, true);
+  Value *UnderlyingObject = getUnderlyingObject(Ptr, 6);
+
+  for (BasicBlock::iterator it = --(DestInst->getIterator()),
+                            begin = SrcInst->getIterator();
+       it != begin; --it) {
+    // The current instruction may already have been checked as an operand.
+    if (std::find(Insts.begin(), Insts.end(), (&*it)) != Insts.end())
+      continue;
+
+    // FIXME: Don't know how to check call instructions that
+    // read/write memory.
+    if (isa<CallInst>(&*it) &&
+        ((&*it)->mayReadFromMemory() || (&*it)->mayWriteToMemory())) {
+      LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs: Found memory access "
+                        << "call: " << (*it) << "\n");
+      return false;
+    }
+
+    if (isa<LoadInst>(&*it) || isa<StoreInst>(&*it)) {
+      Value *PtrTemp = nullptr;
+      if (isa<LoadInst>(&*it))
+        PtrTemp = cast<LoadInst>(&*it)->getPointerOperand();
+      else if (isa<StoreInst>(&*it))
+        PtrTemp = cast<StoreInst>(&*it)->getPointerOperand();
+
+      Value *UnderlyingObjectTemp = getUnderlyingObject(PtrTemp, 6);
+
+      LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs:\n"
+                        << "  UnderlyingObject: " << *UnderlyingObject << "\n"
+                        << "  UnderlyingObjectTemp: " << *UnderlyingObjectTemp
+                        << "\n");
+
+      // In some cases the value returned by getUnderlyingObject is itself
+      // still an Instruction.
+      // The following if statement may then be false, and the current
+      // instruction is assumed not to need moving. If the current
+      // instruction actually did need to be moved, that would mean there
+      // is a memory conflict between the current instruction and DestInst;
+      // the runtime memory checks can detect that conflict and fall back
+      // to the original scalar loop.
+      if (UnderlyingObjectTemp == UnderlyingObject) {
+        const SCEV *First = PSE.getSCEV(PtrTemp);
+        const SCEV *Second = PSE.getSCEV(Ptr);
+        const SCEV *Dist = PSE.getSE()->getMinusSCEV(Second, First);
+        const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+
+        if (!C) {
+          LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs: Found non-constant "
+                            << "distance!!!\n");
+          return false;
+        }
+
+        const APInt &Val = C->getAPInt();
+        int64_t Distance = Val.getSExtValue();
+        LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs: Distance between:\n"
+                          << "  " << *PtrTemp << "\n"
+                          << "  " << *Ptr << "\n"
+                          << "  is " << Distance << "\n");
+
+        // A related instruction needs to be moved up together with DestInst,
+        // so check whether it can be moved. E.g.
+        //
+        // Stride > 0
+        //   SrcInst:     c[i] = a[i];
+        //   RelatedInst: b[i + 1] = c[i];
+        //   DestInst:    a[i + 1] = b[i];
+        //
+        // Stride < 0
+        //   SrcInst:     c[i] = a[i];
+        //   RelatedInst: b[i - 1] = c[i];
+        //   DestInst:    a[i - 1] = b[i];
+        if ((Stride > 0 && Distance <= 0) || (Stride < 0 && Distance >= 0)) {
+          LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs: Related "
+                            << "Instructions' Check\n");
+          CanMove = canMoveInstrs(SrcInst, &*it, Insts);
+          if (!CanMove)
+            return false;
+        }
+      }
+    }
+  }
+
+  return CanMove;
+}
+
+bool LoopVectorizationLegality::canUnsafeDepsVectorize() {
+  bool BackwardCanVectorize = true;
+  SmallVector<MemoryDepChecker::Dependence, 4> BackwardDeps;
+
+  if (LAI->hasConvergentOp())
+    return false;
+
+  const MemoryDepChecker &Checker = LAI->getDepChecker();
+  if (Checker.isUnsafeBackwardMinDistGTOtherBackwardDist()) {
+    return false;
+  }
+
+  const auto &Dependences = Checker.getDependences();
+  if (Dependences == nullptr || Dependences->empty())
+    return false;
+
+  for (const auto &Dep : *Dependences) {
+    LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: "
+                      << "Unsafe dependence type is: " << Dep.Type << "\n");
+
+    // Unsafe dependences other than Backward cannot be handled; collect
+    // the Backward ones for the checks below.
+    if (Dep.Type != MemoryDepChecker::Dependence::Backward &&
+        MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) !=
+            MemoryDepChecker::VectorizationSafetyStatus::Safe)
+      return false;
+    else if (Dep.Type == MemoryDepChecker::Dependence::Backward)
+      BackwardDeps.push_back(Dep);
+  }
+
+  // Check the backward dependences to see if the involved instructions
+  // can be moved.
+  for (const auto &Dep : BackwardDeps) {
+    Instruction *SourceInst = Dep.getSource(*LAI);
+    Instruction *DestInst = Dep.getDestination(*LAI);
+
+    if (SourceInst->getParent() != DestInst->getParent()) {
+      LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: "
+                        << "Src: " << *SourceInst << " Dest: " << *DestInst
+                        << " are in different basic blocks\n");
+
+      // Moving instructions between basic blocks is not supported.
+      BackwardCanVectorize = false;
+      break;
+    }
+
+    // For the following case, if the DestInst is moved before the
+    // SourceInst, it should not prevent store-to-load forwarding.
+    //   SourceInst: load a[i]
+    //   DestInst:   store a[i + n]
+    if (isa<LoadInst>(SourceInst) && isa<StoreInst>(DestInst) &&
+        EnableForwardingConflictDetection) {
+      // Get the distance between the SourceInst and the DestInst.
+      Value *SourcePtr = cast<LoadInst>(SourceInst)->getPointerOperand();
+      Value *DestPtr = cast<StoreInst>(DestInst)->getPointerOperand();
+      Type *SourceTy = SourcePtr->getType()->getPointerElementType();
+      auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+      uint64_t TypeByteSize = DL.getTypeAllocSize(SourceTy);
+      const SCEV *Source = PSE.getSCEV(SourcePtr);
+      const SCEV *Dest = PSE.getSCEV(DestPtr);
+      const SCEV *Dist = PSE.getSE()->getMinusSCEV(Dest, Source);
+      const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+
+      if (!C) {
+        LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: Found "
+                          << "non-vectorizable store-after-load backward "
+                          << "dependence (non-constant distance):\n"
+                          << "  " << *SourceInst << "\n"
+                          << "  " << *DestInst << "\n");
+        BackwardCanVectorize = false;
+        break;
+      }
+
+      const APInt &Val = C->getAPInt();
+      int64_t Distance = Val.getSExtValue();
+
+      // Do a check similar to MemoryDepChecker::couldPreventStoreLoadForward.
+      //
+      // The difference is that here the distance is already known to be
+      // smaller than the minimum distance vectorization needs, so only the
+      // following check is required.
+      if (Distance % (2 * TypeByteSize)) {
+        LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: Found "
+                          << "non-vectorizable store-after-load backward "
+                          << "dependence (may prevent forwarding):\n"
+                          << "  " << *SourceInst << "\n"
+                          << "  " << *DestInst << "\n");
+        BackwardCanVectorize = false;
+        break;
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: The original basic block:\n"
+                      << *(SourceInst->getParent()) << "\n");
+
+    SmallVector<Instruction *, 4> CheckedInsts;
+    BackwardCanVectorize = canMoveInstrs(SourceInst, DestInst, CheckedInsts);
+    LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: canMoveInstrs returned: "
+                      << BackwardCanVectorize << "\n");
+
+    if (!BackwardCanVectorize)
+      break;
+
+    // The CheckedInsts list may contain instructions that do not need to
+    // be moved, e.g.:
+    //   - an instruction in the for.preheader block used by DestInst
+    //   - an instruction already before SourceInst used by DestInst
+    SmallVector<Instruction *, 4> ReorderNeededInstructionList;
+    for (auto *I : CheckedInsts) {
+      if (I->getParent() == SourceInst->getParent() &&
+          SourceInst->comesBefore(I))
+        ReorderNeededInstructionList.push_back(I);
+    }
+    ReorderNeededInstructionPairs.push_back(
+        std::make_pair(SourceInst, ReorderNeededInstructionList));
+  }
+
+  if (!BackwardCanVectorize) {
+    ReorderNeededInstructionPairs.clear();
+  } else {
+    // The following sorts are important: they ensure the correctness of
+    // the move, and they keep the order of the instructions after the
+    // move as close as possible to their original order.
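+    // Illustrative note: for the foo() case in the added test, the list
+    // for the store to a[i] contains the induction increment, the GEP for
+    // a[i + 1] and the load itself; sorting with comesBefore moves the
+    // address computation first, so every hoisted instruction still
+    // follows its operands after the move.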
+    for (auto &InstPair : ReorderNeededInstructionPairs) {
+      SmallVector<Instruction *, 4> &Insts = InstPair.second;
+
+      llvm::sort(
+          Insts.begin(), Insts.end(),
+          [](Instruction *L, Instruction *R) { return L->comesBefore(R); });
+    }
+
+    llvm::sort(
+        ReorderNeededInstructionPairs.begin(),
+        ReorderNeededInstructionPairs.end(),
+        [](const std::pair<Instruction *, SmallVector<Instruction *, 4>> &L,
+           const std::pair<Instruction *, SmallVector<Instruction *, 4>> &R) {
+          Instruction *LInst = L.second.front();
+          Instruction *RInst = R.second.front();
+          return LInst->comesBefore(RInst);
+        });
+  }
+
+  return BackwardCanVectorize;
+}
+
 bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
   Value *In0 = const_cast<Value *>(V);
   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9433,6 +9433,43 @@
                                 F, &Hints, IAI);
   CM.collectValuesToIgnore();
 
+  // Reorder the memory access instructions that MemoryDepChecker reported
+  // as unsafe but that LoopVectorizationLegality says can be made safe.
+  //
+  // After the planner has planned, this reordering is reverted.
+  SmallPtrSet<BasicBlock *, 2> BlockPtrsSet;
+  SmallVector<SmallVector<Instruction *, 4>, 2> OriginBlocks;
+  for (auto &InstPair : LVL.getReorderNeededInstructionPairs()) {
+    Instruction *SourceInst = InstPair.first;
+    SmallVector<Instruction *, 4> Insts = InstPair.second;
+
+    // Save the original order for reverting.
+    if (!BlockPtrsSet.contains(SourceInst->getParent())) {
+      BlockPtrsSet.insert(SourceInst->getParent());
+      SmallVector<Instruction *, 4> OriginInstrs;
+      for (Instruction &I : *(SourceInst->getParent())) {
+        OriginInstrs.push_back(&I);
+      }
+      OriginBlocks.push_back(OriginInstrs);
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: Move instructions before: " << *SourceInst
+                      << "\n");
+
+    for (Instruction *I : Insts) {
+      // The instruction may already be before SourceInst because of an
+      // earlier move; if so, skip it.
+      if (I->comesBefore(SourceInst))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "  " << *I << "\n");
+
+      I->moveBefore(SourceInst);
+    }
+  }
+
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
 
@@ -9443,6 +9480,19 @@
   // Plan how to best vectorize, return the best VF and its cost.
   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
 
+  // Revert to the original loop order.
+  for (auto OB : OriginBlocks) {
+    BasicBlock *BB = OB.front()->getParent();
+
+    LLVM_DEBUG(dbgs() << "LV: The reordered basic block:\n" << *BB << "\n");
+
+    // Revert the instruction order.
+    for (auto *I : OB) {
+      I->removeFromParent();
+      BB->getInstList().push_back(I);
+    }
+  }
+
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll
--- a/llvm/test/Transforms/LoopVectorize/memdep.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep.ll
@@ -60,6 +60,7 @@
 }
 
 ; Plausible dependence of distance 2 - can be vectorized with a width of 2.
+; It can also be vectorized with a width of 4 by reordering the accesses.
 ; for (i = 0; i < 1024; ++i)
 ;   A[i+2] = A[i] + 1;
 
@@ -67,7 +68,7 @@
 ; CHECK: <2 x i32>
 
 ; WIDTH: f3_vec_len
-; WIDTH-NOT: <4 x i32>
+; WIDTH: <4 x i32>
 
 define void @f3_vec_len(i32* %A) {
 entry:
@@ -91,7 +92,7 @@
   ret void
 }
 
-; Plausible dependence of distance 1 - cannot be vectorized (without reordering
+; Plausible dependence of distance 1 - can be vectorized (with reordering
 ; accesses).
 ; for (i = 0; i < 1024; ++i) {
 ;   B[i] = A[i];
@@ -99,7 +100,7 @@
 ; }
 
 ; CHECK-LABEL: @f5(
-; CHECK-NOT: <2 x i32>
+; CHECK: <2 x i32>
 
 define void @f5(i32* %A, i32* %B) {
 entry:
@@ -244,8 +245,10 @@
 
 ; RIGHTVF-LABEL: @pr34283
 ; RIGHTVF: <4 x i64>
 
+; Before backward dependences could be vectorized, the following check
+; was 'WRONGVF-NOT: <8 x i64>'. Now the loop can be vectorized at this width.
 ; WRONGVF-LABLE: @pr34283
-; WRONGVF-NOT: <8 x i64>
+; WRONGVF: <8 x i64>
 
 @a = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16
diff --git a/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll b/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll
@@ -0,0 +1,495 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; The C code corresponding to the following cases is as follows:
+;
+; #define N 1024
+;
+; float a[N], b[N], c[N], d[N], e[N], f[N];
+;
+; // Loops that can be vectorized
+; void foo(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;   }
+; }
+;
+; void foo1(int LEN) {
+;   for (int i = LEN - 2; i > -1; i--) {
+;     a[i + 1] = c[i] * 10;
+;     b[i] = a[i] + 1;
+;   }
+; }
+;
+; void foo2(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;     d[i] = b[i + 1] - 2;
+;   }
+; }
+;
+; void foo3(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     a[i] = b[i] * 2;
+;     a[i + 1] = c[i] + 1;
+;   }
+; }
+;
+; void foo4(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     b[i] = a[i] * 2;
+;     a[i + 2] = c[i] + 1;
+;   }
+; }
+;
+; void foo5(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     b[i] = a[i] * 2;
+;     c[i + 2] = d[i];
+;     a[i + 2] = c[i] + 1;
+;   }
+; }
+;
+; // Loops that can't be vectorized
+; // The backward dependence involves different basic blocks.
+; void bar(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     if (i % 2)
+;       a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;   }
+; }
+;
+; // The backward dependence is a true data dependence (operand check).
+; void bar1(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     b[i] = a[i];
+;     a[i + 2] = b[i];
+;   }
+; }
+;
+; // The backward dependence is a true data dependence (related-instruction check).
+; void bar2(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     b[i] = a[i];
+;     c[i + 2] = b[i];
+;     a[i + 2] = c[i];
+;   }
+; }
+;
+; // The backward dependence would turn into ForwardButPreventsForwarding.
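+; //   (If the store to a[i + 1] were hoisted above the load of a[i], the
+; //   resulting 4-byte store-to-load distance would prevent store-to-load
+; //   forwarding, so this loop is rejected instead.)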
+; void bar3(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; b[i] = a[i] * 2; +; a[i + 1] = c[i] + 1; +; } +; } +; +@a = dso_local global [1024 x float] zeroinitializer, align 4 +@b = dso_local global [1024 x float] zeroinitializer, align 4 +@c = dso_local global [1024 x float] zeroinitializer, align 4 +@d = dso_local global [1024 x float] zeroinitializer, align 4 +@e = dso_local global [1024 x float] zeroinitializer, align 4 +@f = dso_local global [1024 x float] zeroinitializer, align 4 + +define dso_local void @foo(i32 %LEN) #0 { +; CHECK-LABEL: @foo( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +entry: + %cmp6 = icmp sgt i32 %LEN, 1 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define dso_local void @foo1(i32 %LEN) #0 { +; CHECK-LABEL: @foo1( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: for.body +; CHECK: load float +; CHECK: fmul float +; CHECK: store float +; CHECK: load float +; CHECK: fadd float +; CHECK: store float +entry: + %cmp6 = icmp sgt i32 %LEN, 1 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %1 = sext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %1, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx, align 4 + %mul = fmul float %2, 1.000000e+01 + %3 = add nsw i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %4 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %4, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.end + +for.end: 
; preds = %for.body, %entry + ret void +} + +define dso_local void @foo2(i32 %LEN) #0 { +; CHECK-LABEL: @foo2( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; REVERT-LABEL: @foo2( +entry: + %cmp8 = icmp sgt i32 %LEN, 1 + br i1 %cmp8, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %arrayidx10 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv.next + %3 = load float, float* %arrayidx10, align 4 + %sub11 = fadd float %3, -2.000000e+00 + %arrayidx13 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %indvars.iv + store float %sub11, float* %arrayidx13, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define dso_local void @foo3(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo3( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: store <4 x float> +entry: + %cmp14 = icmp sgt i32 %LEN, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + store float %add, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @foo4(i32 
%LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo4( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: store <4 x float> +entry: + %cmp14 = icmp sgt i32 %LEN, 2 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %add, float* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @foo5(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo5( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: store <4 x float> +entry: + %cmp21 = icmp sgt i32 %LEN, 2 + br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %3 + store float %2, float* %arrayidx6, align 4 + %arrayidx8 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %4 = load float, float* %arrayidx8, align 4 + %add9 = fadd float %4, 1.000000e+00 + %arrayidx12 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %add9, float* %arrayidx12, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar(i32 %LEN) #0 { +; CHECK-LABEL: @bar( +; CHECK-NOT: vector.body +; CHECK: for.body +; CHECK: 
if.then +; CHECK: load float +; CHECK: fmul float +; CHECK: store float +; CHECK: if.end +; CHECK: load float +; CHECK: fadd float +; CHECK: store float +entry: + %cmp7 = icmp sgt i32 %LEN, 1 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %if.end + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end ] + %rem10 = and i64 %indvars.iv, 1 + %tobool.not = icmp eq i64 %rem10, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %if.end, %entry + ret void +} + +define dso_local void @bar1(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar1( +; CHECK-NOT: vector.body +entry: + %cmp13 = icmp sgt i32 %LEN, 2 + br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %1, float* %arrayidx2, align 4 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %2 + store float %1, float* %arrayidx6, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar2(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar2( +; CHECK-NOT: vector.body +entry: + %cmp20 = icmp sgt i32 %LEN, 2 + br i1 %cmp20, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 
%indvars.iv + store float %1, float* %arrayidx2, align 4 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %2 + store float %1, float* %arrayidx6, align 4 + %arrayidx8 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %3 = load float, float* %arrayidx8, align 4 + %arrayidx11 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %2 + store float %3, float* %arrayidx11, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar3(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar3( +; CHECK-NOT: vector.body +entry: + %cmp14 = icmp sgt i32 %LEN, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + store float %add, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}