Index: llvm/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -215,6 +215,13 @@
   /// the accesses safely with.
   uint64_t getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }

+  /// True if there is an unsafe backward dependence whose minimum safe
+  /// distance is greater than the already-known distance of another
+  /// backward dependence.
+  bool isUnsafeBackwardMinDistGTOtherBackwardDist() const {
+    return UnsafeBackwardMinDistGTOtherBackwardDist;
+  }
+
   /// Return the number of elements that are safe to operate on
   /// simultaneously, multiplied by the size of the element in bits.
   uint64_t getMaxSafeVectorWidthInBits() const {
@@ -280,6 +287,11 @@
   // We can access this many bytes in parallel safely.
   uint64_t MaxSafeDepDistBytes;

+  /// True if there is an unsafe backward dependence whose minimum safe
+  /// distance is greater than the already-known distance of another
+  /// backward dependence.
+  bool UnsafeBackwardMinDistGTOtherBackwardDist;
+
   /// Number of elements (from consecutive iterations) that are safe to
   /// operate on simultaneously, multiplied by the size of the element in bits.
   /// The size of the element is taken from the memory access that is most
Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -235,6 +235,10 @@
   /// inductions and reductions.
   using RecurrenceSet = SmallPtrSet<PHINode *, 8>;

+  /// ReorderList contains the load/store instruction pairs which
+  /// should be reordered before vectorization.
+  using ReorderList = SmallVector<std::pair<Instruction *, Instruction *>, 2>;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -353,6 +357,18 @@
     return ConditionalAssumes;
   }

+  /// Returns the ReorderNeededInstructionPairs list.
+  ///
+  /// The ReorderNeededInstructionPairs list is filled when
+  /// canVectorizeMemory finds unsafe memory accesses that can be
+  /// made safe by reordering the involved instructions.
+  ///
+  /// Also see canUnsafeDepsVectorize.
+  ReorderList &getReorderNeededInstructionPairs() {
+    return ReorderNeededInstructionPairs;
+  }
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
@@ -387,6 +403,23 @@
   /// Returns true if the loop is vectorizable
   bool canVectorizeMemory();

+  /// Returns true if the unsafe memory accesses reported by
+  /// MemoryDepChecker can be made safe by reordering the involved
+  /// instructions.
+  ///
+  /// This method may add the involved instruction pairs to the
+  /// ReorderNeededInstructionPairs list; the client should reorder
+  /// each instruction pair before vectorization. Currently only the
+  /// following pattern is recorded, and the IR reordering may move
+  /// the second instruction and its dependences in front of the
+  /// first:
+  ///
+  ///   <StoreInst, LoadInst>
+  ///
+  /// If there are other unsafe patterns, this returns false and the
+  /// ReorderNeededInstructionPairs list is left empty.
+  bool canUnsafeDepsVectorize();
+
   /// Return true if we can vectorize this loop using the IF-conversion
   /// transformation.
   bool canVectorizeWithIfConvert();
@@ -520,6 +553,10 @@
   /// flattened.
   SmallPtrSet<Instruction *, 8> ConditionalAssumes;

+  /// Holds the instruction pairs that need to be reordered.
+  /// Also see canUnsafeDepsVectorize.
+  ReorderList ReorderNeededInstructionPairs;
+
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1623,6 +1623,7 @@
   if (MinDistanceNeeded > MaxSafeDepDistBytes) {
     LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
                       << MinDistanceNeeded << " size in bytes");
+    UnsafeBackwardMinDistGTOtherBackwardDist = true;
     return Dependence::Backward;
   }
@@ -1663,6 +1664,7 @@
                                    const ValueToValueMap &Strides) {

   MaxSafeDepDistBytes = -1;
+  UnsafeBackwardMinDistGTOtherBackwardDist = false;
   SmallPtrSet<MemAccessInfo, 8> Visited;
   for (MemAccessInfo CurAccess : CheckDeps) {
     if (Visited.count(CurAccess))
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -874,15 +874,21 @@
 bool LoopVectorizationLegality::canVectorizeMemory() {
   LAI = &(*GetLAA)(*TheLoop);
-  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
-  if (LAR) {
-    ORE->emit([&]() {
-      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
-                                        "loop not vectorized: ", *LAR);
-    });
-  }
-  if (!LAI->canVectorizeMemory())
+
+  // LoopAccessInfo checks the dependences based on source order, while
+  // canUnsafeDepsVectorize checks whether the dependences reported as
+  // unsafe could be made safe by reordering the IR.
+  if (!LAI->canVectorizeMemory() && !canUnsafeDepsVectorize()) {
+    const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+    if (LAR) {
+      ORE->emit([&]() {
+        return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+                                          "loop not vectorized: ", *LAR);
+      });
+    }
     return false;
+  }

   if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
     reportVectorizationFailure("Stores to a uniform address",
@@ -896,6 +902,60 @@
   return true;
 }

+bool LoopVectorizationLegality::canUnsafeDepsVectorize() {
+  bool BackwardCanVectorize = true;
+  Instruction *SourceInst, *DestInst;
+
+  if (LAI->hasConvergentOp())
+    return false;
+
+  const MemoryDepChecker &DepChecker = LAI->getDepChecker();
+  if (DepChecker.isUnsafeBackwardMinDistGTOtherBackwardDist())
+    return false;
+
+  const auto *Dependences = DepChecker.getDependences();
+  if (Dependences == nullptr || Dependences->empty())
+    return false;
+
+  for (const auto &Dep : *Dependences) {
+    if (!BackwardCanVectorize)
+      break;
+
+    // If the dependence is not Backward, check that it is otherwise safe
+    // and move on to the next one.
+    if (Dep.Type != MemoryDepChecker::Dependence::Backward) {
+      BackwardCanVectorize &=
+          MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) ==
+          MemoryDepChecker::VectorizationSafetyStatus::Safe;
+      continue;
+    }
+
+    SourceInst = Dep.getSource(*LAI);
+    DestInst = Dep.getDestination(*LAI);
+
+    if (SourceInst->getParent() != DestInst->getParent()) {
+      // Moving instructions between basic blocks is not supported.
+      BackwardCanVectorize = false;
+    } else {
+      // Check whether the source and destination instructions form a
+      // load-after-store pair, which can be made vectorizable.
+      // FIXME: There are other patterns that could be vectorized, but
+      // they would need a data dependency check and would make the
+      // reordering more complex.
+      if (isa<StoreInst>(SourceInst) && isa<LoadInst>(DestInst))
+        ReorderNeededInstructionPairs.push_back(
+            std::make_pair(SourceInst, DestInst));
+      else
+        BackwardCanVectorize = false;
+    }
+  }
+
+  if (!BackwardCanVectorize)
+    ReorderNeededInstructionPairs.clear();
+
+  return BackwardCanVectorize;
+}
+
 bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
   Value *In0 = const_cast<Value *>(V);
   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1921,6 +1921,43 @@
     collectSupportedLoops(*InnerL, LI, ORE, V);
 }

+/// Move the DestInst instruction and its dependences in front of the
+/// SourceInst instruction.
+static void moveDestInstBeforeSourceInst(Instruction *SourceInst,
+                                         Instruction *DestInst) {
+
+  if (DestInst->getParent() != SourceInst->getParent())
+    return;
+
+  if (DestInst->comesBefore(SourceInst))
+    return;
+
+  // First move the operands DestInst depends on, then DestInst itself.
+  for (auto &U : DestInst->operands()) {
+    if (isa<Instruction>(U))
+      moveDestInstBeforeSourceInst(SourceInst, cast<Instruction>(U));
+  }
+
+  DestInst->moveBefore(SourceInst);
+}
+
+/// Restore the instruction order according to OriginBlocks.
+///
+/// OriginBlocks stores the original instruction lists of the basic blocks
+/// whose instructions have been reordered.
+static void
+revertReorderedBBs(ArrayRef<SmallVector<Instruction *, 8>> OriginBlocks) {
+  for (const auto &OB : OriginBlocks) {
+    BasicBlock *BB = OB.front()->getParent();
+
+    // Restore the original instruction order.
+    for (Instruction *I : OB) {
+      I->removeFromParent();
+      BB->getInstList().push_back(I);
+    }
+  }
+}
+
 namespace {

 /// The LoopVectorize Pass.
@@ -9238,6 +9275,33 @@
                                 F, &Hints, IAI);
   CM.collectValuesToIgnore();

+  // Reorder the memory access instructions which are reported unsafe by
+  // MemoryDepChecker but which LoopVectorizationLegality says can be made
+  // safe.
+  SmallPtrSet<BasicBlock *, 4> BlockPtrsSet;
+  SmallVector<SmallVector<Instruction *, 8>, 2> OriginBlocks;
+  for (auto InstPair : LVL.getReorderNeededInstructionPairs()) {
+    Instruction *SourceInst = InstPair.first;
+    Instruction *DestInst = InstPair.second;
+
+    LLVM_DEBUG(dbgs() << "Reordering src: " << *SourceInst
+                      << " dest: " << *DestInst << "\n");
+
+    // Save the original order for reverting.
+    if (!BlockPtrsSet.contains(SourceInst->getParent())) {
+      BlockPtrsSet.insert(SourceInst->getParent());
+      SmallVector<Instruction *, 8> OriginInstrs;
+      for (Instruction &I : *(SourceInst->getParent()))
+        OriginInstrs.push_back(&I);
+      OriginBlocks.push_back(OriginInstrs);
+    }
+
+    // Currently only one case is handled: SourceInst is a StoreInst and
+    // DestInst is a LoadInst.
+    moveDestInstBeforeSourceInst(SourceInst, DestInst);
+  }
+
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
@@ -9264,6 +9328,10 @@
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                          "requirements.\n");
     Hints.emitRemarkWithHints();
+
+    if (!OriginBlocks.empty())
+      revertReorderedBBs(OriginBlocks);
+
     return false;
   }
@@ -9310,6 +9378,9 @@
   // Override IC if user provided an interleave count.
   IC = UserIC > 0 ? UserIC : IC;

+  if (!VectorizeLoop && !OriginBlocks.empty())
+    revertReorderedBBs(OriginBlocks);
+
   // Emit diagnostic messages, if any.
   const char *VAPassName = Hints.vectorizeAnalysisPassName();
   if (!VectorizeLoop && !InterleaveLoop) {
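For reference, a source-level sketch of what this reordering enables; the standalone functions and their names (foo_before, foo_after) are illustrative only and mirror the @foo case from the new test below:

#define N 1024
float a[N], b[N], c[N];

/* Before reordering: the store to a[i] is followed by a load of a[i + 1],
   a backward dependence of distance 1 that MemoryDepChecker reports as
   unsafe, so the loop is not vectorized. */
void foo_before(int LEN) {
  for (int i = 0; i < LEN - 1; i++) {
    a[i] = c[i] * 10;
    b[i] = a[i + 1] + 1;
  }
}

/* After moving the load (and the computation that depends on it) above the
   store, the same accesses form a forward dependence, which is safe to
   vectorize. This is the source-level equivalent of what
   moveDestInstBeforeSourceInst does on the IR. */
void foo_after(int LEN) {
  for (int i = 0; i < LEN - 1; i++) {
    float t = a[i + 1] + 1;
    a[i] = c[i] * 10;
    b[i] = t;
  }
}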
Index: llvm/test/Transforms/LoopVectorize/memdep.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/memdep.ll
+++ llvm/test/Transforms/LoopVectorize/memdep.ll
@@ -91,38 +91,6 @@
   ret void
 }

-; Plausible dependence of distance 1 - cannot be vectorized (without reordering
-; accesses).
-; for (i = 0; i < 1024; ++i) {
-;   B[i] = A[i];
-;   A[i] = B[i + 1];
-; }
-
-; CHECK-LABEL: @f5(
-; CHECK-NOT: <2 x i32>
-
-define void @f5(i32* %A, i32* %B) {
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
-  store i32 %0, i32* %arrayidx2, align 4
-  %indvars.iv.next = add nsw i64 %indvars.iv, 1
-  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv.next
-  %1 = load i32, i32* %arrayidx4, align 4
-  store i32 %1, i32* %arrayidx, align 4
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp ne i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
 ; Dependence through a phi node - must not vectorize.
 ; for (i = 0; i < 1024; ++i) {
 ;   a[i+1] = tmp;
Index: llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-load-after-store.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-load-after-store.ll
@@ -0,0 +1,550 @@
+; RUN: opt < %s -O2 -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
+; RUN: opt < %s -O2 -force-vector-interleave=1 -S | FileCheck %s -check-prefix=REVERT
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; The C code corresponding to the following cases is as follows:
+;
+; #define N 1024
+;
+; float a[N], b[N], c[N], d[N], e[N], f[N];
+;
+; // Loops that can be vectorized
+; void foo(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;   }
+; }
+;
+; void foo1(int LEN) {
+;   for (int i = LEN - 2; i > -1; i--) {
+;     a[i + 1] = c[i] * 10;
+;     b[i] = a[i] + 1;
+;   }
+; }
+;
+; void foo2(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     a[i] = c[i] * 10;
+;     b[i] = a[i + 2] + 1;
+;   }
+; }
+;
+; void foo3(int LEN) {
+;   for (int i = 1; i < LEN - 1; i++) {
+;     a[i] = c[i] * 10;
+;     a[i - 1] = d[i] * 11;
+;     b[i] = a[i + 1] + 1;
+;   }
+; }
+;
+; void foo4(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     if (i % 2) {
+;       a[i] = c[i] * 10;
+;       b[i] = a[i + 2] + 1;
+;     } else {
+;       d[i] = e[i] + 11;
+;       f[i] = d[i + 2] * 2;
+;     }
+;   }
+; }
+;
+; void foo5(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;     d[i] = b[i + 1] - 2;
+;   }
+; }
+;
+; // Loops that can't be vectorized
+; void bar(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     if (i % 2)
+;       a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;   }
+; }
+
+@a = dso_local global [1024 x float] zeroinitializer, align 4
+@b = dso_local global [1024 x float] zeroinitializer, align 4
+@c = dso_local global [1024 x float] zeroinitializer, align 4
+@d = dso_local global [1024 x float] zeroinitializer, align 4
+@e = dso_local global [1024 x float] zeroinitializer, align 4
+@f = dso_local global [1024 x float] zeroinitializer, align 4
+
+define dso_local void @foo(i32
%LEN) #0 { +; CHECK-LABEL: @foo( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %1, 1 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %3 = load float, float* %arrayidx, align 4 + %mul = fmul float %3, 1.000000e+01 + %4 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %4 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %5 = load i32, i32* %i, align 4 + %add = add nsw i32 %5, 1 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %6, 1.000000e+00 + %7 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %7 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + store float %add5, float* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %8 = load i32, i32* %i, align 4 + %inc = add nsw i32 %8, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @foo1(i32 %LEN) #0 { +; CHECK-LABEL: @foo1( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; REVERT-LABEL: @foo1( +; REVERT-NOT: vector.body +; REVERT: for.body +; REVERT: load float +; REVERT: fmul float +; REVERT: store float +; REVERT: load float +; REVERT: fadd float +; REVERT: store float +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + %0 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %0, 2 + store i32 %sub, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp sgt i32 %1, -1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %3 = load float, float* %arrayidx, align 4 + %mul = fmul float %3, 1.000000e+01 + %4 = load i32, i32* %i, align 4 + %add = add nsw i32 %4, 1 + %idxprom1 = sext i32 %add to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %5 = load i32, i32* %i, align 4 + %idxprom3 = sext i32 %5 to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %6, 1.000000e+00 + %7 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %7 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + 
store float %add5, float* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %8 = load i32, i32* %i, align 4 + %dec = add nsw i32 %8, -1 + store i32 %dec, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @foo2(i32 %LEN) #0 { +; CHECK-LABEL: @foo2( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; REVERT-LABEL: @foo2( +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %1, 2 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %3 = load float, float* %arrayidx, align 4 + %mul = fmul float %3, 1.000000e+01 + %4 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %4 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %5 = load i32, i32* %i, align 4 + %add = add nsw i32 %5, 2 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %6, 1.000000e+00 + %7 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %7 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + store float %add5, float* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %8 = load i32, i32* %i, align 4 + %inc = add nsw i32 %8, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @foo3(i32 %LEN) #0 { +; CHECK-LABEL: @foo3( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 1, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %1, 1 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %3 = load float, float* %arrayidx, align 4 + %mul = fmul float %3, 1.000000e+01 + %4 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %4 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %5 = load i32, i32* %i, align 4 + %idxprom3 = sext i32 %5 to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %mul5 = fmul float %6, 
1.100000e+01 + %7 = load i32, i32* %i, align 4 + %sub6 = sub nsw i32 %7, 1 + %idxprom7 = sext i32 %sub6 to i64 + %arrayidx8 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom7 + store float %mul5, float* %arrayidx8, align 4 + %8 = load i32, i32* %i, align 4 + %add = add nsw i32 %8, 1 + %idxprom9 = sext i32 %add to i64 + %arrayidx10 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom9 + %9 = load float, float* %arrayidx10, align 4 + %add11 = fadd float %9, 1.000000e+00 + %10 = load i32, i32* %i, align 4 + %idxprom12 = sext i32 %10 to i64 + %arrayidx13 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom12 + store float %add11, float* %arrayidx13, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %11 = load i32, i32* %i, align 4 + %inc = add nsw i32 %11, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @foo4(i32 %LEN) #0 { +; CHECK-LABEL: @foo4( +; CHECK: vector.body +; REVERT-LABEL: @foo4( +; REVERT-NOT: vector.body +; REVERT: for.body +; REVERT: if.then +; REVERT: load +; REVERT: fmul +; REVERT: store +; REVERT: load +; REVERT: fadd +; REVERT: if.else +; REVERT: load +; REVERT: fadd +; REVERT: store +; REVERT: load +; REVERT: fmul +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %1, 2 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %rem = srem i32 %2, 2 + %tobool = icmp ne i32 %rem, 0 + br i1 %tobool, label %if.then, label %if.else + +if.then: ; preds = %for.body + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %4 = load float, float* %arrayidx, align 4 + %mul = fmul float %4, 1.000000e+01 + %5 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %6 = load i32, i32* %i, align 4 + %add = add nsw i32 %6, 2 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %7 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %7, 1.000000e+00 + %8 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %8 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + store float %add5, float* %arrayidx7, align 4 + br label %if.end + +if.else: ; preds = %for.body + %9 = load i32, i32* %i, align 4 + %idxprom8 = sext i32 %9 to i64 + %arrayidx9 = getelementptr inbounds [1024 x float], [1024 x float]* @e, i64 0, i64 %idxprom8 + %10 = load float, float* %arrayidx9, align 4 + %add10 = fadd float %10, 1.100000e+01 + %11 = load i32, i32* %i, align 4 + %idxprom11 = sext i32 %11 to i64 + %arrayidx12 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %idxprom11 + store float %add10, float* %arrayidx12, align 4 + %12 = load i32, i32* %i, align 4 + %add13 = add nsw i32 %12, 2 + %idxprom14 = sext i32 %add13 to i64 + %arrayidx15 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 
%idxprom14 + %13 = load float, float* %arrayidx15, align 4 + %mul16 = fmul float %13, 2.000000e+00 + %14 = load i32, i32* %i, align 4 + %idxprom17 = sext i32 %14 to i64 + %arrayidx18 = getelementptr inbounds [1024 x float], [1024 x float]* @f, i64 0, i64 %idxprom17 + store float %mul16, float* %arrayidx18, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %for.inc + +for.inc: ; preds = %if.end + %15 = load i32, i32* %i, align 4 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @foo5(i32 %LEN) #0 { +; CHECK-LABEL: @foo5( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; REVERT-LABEL: @foo5( +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %1, 1 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %3 = load float, float* %arrayidx, align 4 + %mul = fmul float %3, 1.000000e+01 + %4 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %4 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %5 = load i32, i32* %i, align 4 + %add = add nsw i32 %5, 1 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %6, 1.000000e+00 + %7 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %7 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + store float %add5, float* %arrayidx7, align 4 + %8 = load i32, i32* %i, align 4 + %add8 = add nsw i32 %8, 1 + %idxprom9 = sext i32 %add8 to i64 + %arrayidx10 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom9 + %9 = load float, float* %arrayidx10, align 4 + %sub11 = fsub float %9, 2.000000e+00 + %10 = load i32, i32* %i, align 4 + %idxprom12 = sext i32 %10 to i64 + %arrayidx13 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %idxprom12 + store float %sub11, float* %arrayidx13, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %11 = load i32, i32* %i, align 4 + %inc = add nsw i32 %11, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @bar(i32 %LEN) #0 { +; CHECK-LABEL: @bar( +; CHECK-NOT: vector.body +; CHECK: for.body +; CHECK: if.then +; CHECK: load float +; CHECK: fmul float +; CHECK: store float +; CHECK: if.end +; CHECK: load float +; CHECK: fadd float +; CHECK: store float +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* 
%LEN.addr, align 4 + %sub = sub nsw i32 %1, 1 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %rem = srem i32 %2, 2 + %tobool = icmp ne i32 %rem, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %for.body + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %4 = load float, float* %arrayidx, align 4 + %mul = fmul float %4, 1.000000e+01 + %5 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %6 = load i32, i32* %i, align 4 + %add = add nsw i32 %6, 1 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %7 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %7, 1.000000e+00 + %8 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %8 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + store float %add5, float* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +}
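A note on the negative test @bar above: the store to a[i] is guarded by the `if (i % 2)` condition, so in the IR it sits in a different basic block than the load of a[i + 1]. canUnsafeDepsVectorize only reorders pairs within a single block (moving instructions between basic blocks is not supported), so this loop keeps its unsafe backward dependence and stays scalar. A minimal annotated C sketch of the rejected shape, for illustration only:

#define N 1024
float a[N], b[N], c[N];

/* Rejected: the conditional store to a[i] ends up in the if.then block,
   while the load of a[i + 1] is in the fall-through block, so the
   reordering helper cannot move the load above the store. */
void bar(int LEN) {
  for (int i = 0; i < LEN - 1; i++) {
    if (i % 2)
      a[i] = c[i] * 10;    /* store in the if.then block */
    b[i] = a[i + 1] + 1;   /* load in the if.end block */
  }
}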