diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2483,6 +2483,12 @@
   /// address space before call and casted back to Ptr type after call.
   Value *CreateStripInvariantGroup(Value *Ptr);
 
+  /// Return a vector value that contains the elements of \p V in reverse
+  /// order. Scalable vectors are reversed with the
+  /// llvm.experimental.vector.reverse intrinsic; fixed-width vectors use a
+  /// shufflevector with a reversing mask.
+  Value *CreateVectorReverse(Value *V, const Twine &Name = "");
+
   /// Return a vector value that contains \arg V broadcasted to \p
   /// NumElts elements.
   Value *CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name = "");
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -992,6 +992,24 @@
   return Fn;
 }
 
+Value *IRBuilderBase::CreateVectorReverse(Value *V, const Twine &Name) {
+  auto *Ty = cast<VectorType>(V->getType());
+  // The lane count of a scalable vector is unknown at compile time, so no
+  // shuffle mask can be built; emit the reverse intrinsic instead.
+  if (isa<ScalableVectorType>(Ty)) {
+    Module *M = BB->getParent()->getParent();
+    Function *F = Intrinsic::getDeclaration(
+        M, Intrinsic::experimental_vector_reverse, Ty);
+    return Insert(CallInst::Create(F, V), Name);
+  }
+  // Keep the original behaviour for fixed-width vectors.
+  SmallVector<int, 8> ShuffleMask;
+  int NumElts = Ty->getElementCount().getKnownMinValue();
+  for (int i = 0; i < NumElts; ++i)
+    ShuffleMask.push_back(NumElts - i - 1);
+  return CreateShuffleVector(V, ShuffleMask, Name);
+}
+
 Value *IRBuilderBase::CreateVectorSplat(unsigned NumElts, Value *V,
                                         const Twine &Name) {
   auto EC = ElementCount::getFixed(NumElts);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2568,12 +2568,7 @@
 
 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
   assert(Vec->getType()->isVectorTy() && "Invalid type");
-  assert(!VF.isScalable() && "Cannot reverse scalable vectors");
-  SmallVector<int, 8> ShuffleMask;
-  for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
-    ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
-
-  return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
+  return Builder.CreateVectorReverse(Vec, "reverse");
 }
 
 // Return whether we allow using masked interleave-groups (for dealing with
@@ -2854,18 +2849,21 @@
     bool InBounds = false;
     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
       InBounds = gep->isInBounds();
-
     if (Reverse) {
-      assert(!VF.isScalable() &&
-             "Reversing vectors is not yet supported for scalable vectors.");
-
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
-      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
+      // RunTimeVF = VScale * VF.getKnownMinValue()
+      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
+      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
+      // NumElt = -Part * RunTimeVF
+      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
+      // LastLane = 1 - RunTimeVF
+      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
+      PartPtr =
+          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
       PartPtr->setIsInBounds(InBounds);
-      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
       PartPtr->setIsInBounds(InBounds);
       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -0,0 +1,68 @@
+; This is the loop in c++ being vectorized in this file with
+; experimental.vector.reverse
+
+;#pragma clang loop vectorize_width(4, scalable)
+;  for (long int i = N - 1; i >= 0; i--)
+;  {
+;    if (cond[i])
+;      a[i] += 1;
+;  }
+
+; The test checks if the mask is being correctly created, reversed and used
+
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @vector_reverse_mask_nxv4i1(double* %a, double* %cond, i64 %N) #0 {
+; CHECK-LABEL: vector.body:
+; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* nonnull %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
+; CHECK-NEXT: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
+; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]]
+; CHECK-NEXT: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
+; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: call void @llvm.masked.store.nxv4f64.p0nxv4f64(<vscale x 4 x double> %[[REVERSE8]], <vscale x 4 x double>* %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]]
+
+entry:
+  %cmp7 = icmp sgt i64 %N, 0
+  br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  ret void
+
+for.body:                                         ; preds = %for.inc, %entry
+  %i.08.in = phi i64 [ %i.08, %for.inc ], [ %N, %entry ]
+  %i.08 = add nsw i64 %i.08.in, -1
+  %arrayidx = getelementptr inbounds double, double* %cond, i64 %i.08
+  %0 = load double, double* %arrayidx, align 8
+  %tobool = fcmp une double %0, 0.000000e+00
+  br i1 %tobool, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds double, double* %a, i64 %i.08
+  %1 = load double, double* %arrayidx1, align 8
+  %add = fadd double %1, 1.000000e+00
+  store double %add, double* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %cmp = icmp sgt i64 %i.08.in, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve"}
+
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -0,0 +1,108 @@
+; This is the loop in c++ being vectorized in this file with
+; experimental.vector.reverse
+;  #pragma clang loop vectorize_width(8, scalable)
+;  for (int i = N-1; i >= 0; --i)
+;    a[i] = b[i] + 1.0;
+
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0{
+; CHECK-LABEL: @vector_reverse_f64
+; CHECK-LABEL: vector.body:
+; CHECK: %[[ADD:.*]] = add i64 %{{.*}}, %N
+; CHECK-NEXT: %[[GEP:.*]] = getelementptr inbounds double, double* %b, i64 %[[ADD]]
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL:.*]] = mul i32 %[[VSCALE]], -8
+; CHECK-NEXT: %[[OR:.*]] = or i32 %[[MUL]], 1
+; CHECK-NEXT: %[[SEXT:.*]] = sext i32 %[[OR]] to i64
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i64 %[[SEXT]]
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <vscale x 8 x double>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <vscale x 8 x double>, <vscale x 8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %[[WIDE]])
+; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 8 x double> %[[REVERSE]], shufflevector
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* %a, i64 %[[ADD]]
+; CHECK-NEXT: %[[REVERSE6:.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %[[FADD]])
+; CHECK-NEXT: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], -8
+; CHECK-NEXT: %[[OR1:.*]] = or i32 %[[MUL1]], 1
+; CHECK-NEXT: %[[SEXT1:.*]] = sext i32 %[[OR1]] to i64
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i64 %[[SEXT1]]
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast double* %[[GEP3]] to <vscale x 8 x double>*
+; CHECK-NEXT: store <vscale x 8 x double> %[[REVERSE6]], <vscale x 8 x double>* %[[CAST1]], align 8
+
+entry:
+  %cmp7 = icmp sgt i64 %N, 0
+  br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08.in = phi i64 [ %i.08, %for.body ], [ %N, %entry ]
+  %i.08 = add nsw i64 %i.08.in, -1
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %i.08
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx1 = getelementptr inbounds double, double* %a, i64 %i.08
+  store double %add, double* %arrayidx1, align 8
+  %cmp = icmp sgt i64 %i.08.in, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+
+define void @vector_reverse_i64(i64 %N, i64* %a, i64* %b) #0 {
+; CHECK-LABEL: vector_reverse_i64
+; CHECK-LABEL: vector.body:
+; CHECK: %[[ADD:.*]] = add i64 %{{.*}}, %N
+; CHECK-NEXT: %[[GEP:.*]] = getelementptr inbounds i64, i64* %b, i64 %[[ADD]]
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL:.*]] = mul i32 %[[VSCALE]], -8
+; CHECK-NEXT: %[[OR:.*]] = or i32 %[[MUL]], 1
+; CHECK-NEXT: %[[SEXT:.*]] = sext i32 %[[OR]] to i64
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i64 %[[SEXT]]
+; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <vscale x 8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %[[WIDE]])
+; CHECK-NEXT: %[[ADD1:.*]] = add <vscale x 8 x i64> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* %a, i64 %[[ADD]]
+; CHECK-NEXT: %[[REVERSE6:.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %[[ADD1]])
+; CHECK-NEXT: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], -8
+; CHECK-NEXT: %[[OR1:.*]] = or i32 %[[MUL1]], 1
+; CHECK-NEXT: %[[SEXT1:.*]] = sext i32 %[[OR1]] to i64
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i64 %[[SEXT1]]
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP3]] to <vscale x 8 x i64>*
+; CHECK-NEXT: store <vscale x 8 x i64> %[[REVERSE6]], <vscale x 8 x i64>* %[[CAST1]], align 8
+
+entry:
+  %cmp8 = icmp sgt i64 %N, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.09.in = phi i64 [ %i.09, %for.body ], [ %N, %entry ]
+  %i.09 = add nsw i64 %i.09.in, -1
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.09
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add i64 %0, 1
+  %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %i.09
+  store i64 %add, i64* %arrayidx2, align 8
+  %cmp = icmp sgt i64 %i.09.in, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 8}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -0,0 +1,65 @@
+; This is the loop in c++ being vectorized in this file with
+; shuffle reverse
+
+;#pragma clang loop vectorize_width(4, fixed)
+;  for (long int i = N - 1; i >= 0; i--)
+;  {
+;    if (cond[i])
+;      a[i] += 1;
+;  }
+
+; The test checks if the mask is being correctly created, reversed and used
+
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @vector_reverse_mask_v4i1(double* %a, double* %cond, i64 %N) #0 {
+; CHECK-LABEL: vector.body:
+; CHECK: %[[REVERSE6:.*]] = shufflevector <4 x i1> %{{.*}}, <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: %[[WIDEMSKLOAD:.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull %{{.*}}, i32 8, <4 x i1> %[[REVERSE6]], <4 x double> poison)
+; CHECK-NEXT: %[[FADD:.*]] = fadd <4 x double> %[[WIDEMSKLOAD]]
+; CHECK: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %[[FADD]], <4 x double>* %{{.*}}, i32 8, <4 x i1> %[[REVERSE6]])
+
+entry:
+  %cmp7 = icmp sgt i64 %N, 0
+  br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  ret void
+
+for.body:                                         ; preds = %for.inc, %entry
+  %i.08.in = phi i64 [ %i.08, %for.inc ], [ %N, %entry ]
+  %i.08 = add nsw i64 %i.08.in, -1
+  %arrayidx = getelementptr inbounds double, double* %cond, i64 %i.08
+  %0 = load double, double* %arrayidx, align 8
+  %tobool = fcmp une double %0, 0.000000e+00
+  br i1 %tobool, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds double, double* %a, i64 %i.08
+  %1 = load double, double* %arrayidx1, align 8
+  %add = fadd double %1, 1.000000e+00
+  store double %add, double* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %cmp = icmp sgt i64 %i.08.in, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve"}
+
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
@@ -0,0 +1,91 @@
+; Test VLA for reverse with fixed size vector
+; This is the loop in c++ being vectorized in this file with
+; shuffle reverse
+;  #pragma clang loop vectorize_width(8, fixed)
+;  for (int i = N-1; i >= 0; --i)
+;    a[i] = b[i] + 1.0;
+
+; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0 {
+; CHECK-LABEL: vector_reverse_f64
+; CHECK-LABEL: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds double, double* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <8 x double>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x double>, <8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x double> %[[WIDE]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[FADD:.*]] = fadd <8 x double> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x double> %[[FADD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds double, double* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP4]] to <8 x double>*
+; CHECK-NEXT: store <8 x double> %[[REVERSE6]], <8 x double>* %[[CAST]], align 8
+
+entry:
+  %cmp7 = icmp sgt i64 %N, 0
+  br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08.in = phi i64 [ %i.08, %for.body ], [ %N, %entry ]
+  %i.08 = add nsw i64 %i.08.in, -1
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %i.08
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx1 = getelementptr inbounds double, double* %a, i64 %i.08
+  store double %add, double* %arrayidx1, align 8
+  %cmp = icmp sgt i64 %i.08.in, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+define void @vector_reverse_i64(i64 %N, i64* %a, i64* %b) #0 {
+; CHECK-LABEL: vector_reverse_i64
+; CHECK-LABEL: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, i64* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x i64>, <8 x i64>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x i64> %[[WIDE]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[FADD:.*]] = add <8 x i64> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x i64> %[[FADD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds i64, i64* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP4]] to <8 x i64>*
+; CHECK-NEXT: store <8 x i64> %[[REVERSE6]], <8 x i64>* %[[CAST1]], align 8
+
+entry:
+  %cmp8 = icmp sgt i64 %N, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.09.in = phi i64 [ %i.09, %for.body ], [ %N, %entry ]
+  %i.09 = add nsw i64 %i.09.in, -1
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.09
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add i64 %0, 1
+  %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %i.09
+  store i64 %add, i64* %arrayidx2, align 8
+  %cmp = icmp sgt i64 %i.09.in, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 8}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}