diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2530,6 +2530,26 @@
     return CreateShuffleVector(V, PoisonValue::get(V->getType()), Mask, Name);
   }
 
+  /// Return a vector containing the elements of \p V in reverse order.
+  /// Scalable vectors use the llvm.experimental.vector.reverse intrinsic;
+  /// fixed-width vectors use a shufflevector with a reversed mask.
+  Value *CreateVectorReverse(Value *V, const Twine &Name = "") {
+    auto *Ty = cast<VectorType>(V->getType());
+    if (isa<ScalableVectorType>(Ty)) {
+      Module *M = BB->getParent()->getParent();
+      Function *F = Intrinsic::getDeclaration(
+          M, Intrinsic::experimental_vector_reverse, Ty);
+      return Insert(CallInst::Create(F, V), Name);
+    }
+    // Keep the original behaviour for fixed-width vectors.
+    SmallVector<int, 8> ShuffleMask;
+    int NumElts = Ty->getElementCount().getKnownMinValue();
+    for (int i = 0; i < NumElts; ++i)
+      ShuffleMask.push_back(NumElts - i - 1);
+
+    return CreateShuffleVector(V, ShuffleMask, Name);
+  }
+
   Value *CreateExtractValue(Value *Agg,
                             ArrayRef<unsigned> Idxs,
                             const Twine &Name = "") {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2504,12 +2504,7 @@
 
 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
   assert(Vec->getType()->isVectorTy() && "Invalid type");
-  assert(!VF.isScalable() && "Cannot reverse scalable vectors");
-  SmallVector<int, 8> ShuffleMask;
-  for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
-    ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
-
-  return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
+  return Builder.CreateVectorReverse(Vec, "reverse");
 }
 
 // Return whether we allow using masked interleave-groups (for dealing with
@@ -2792,17 +2787,33 @@
       InBounds = gep->isInBounds();
 
     if (Reverse) {
-      assert(!VF.isScalable() &&
-             "Reversing vectors is not yet supported for scalable vectors.");
-
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
-      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
-      PartPtr->setIsInBounds(InBounds);
-      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
-      PartPtr->setIsInBounds(InBounds);
+      if (VF.isScalable()) {
+        // NumElt = -Part * (/*getRunTimeVF*/ VScale * VF.getKnownMinValue())
+        Value *NumElt =
+            Builder.CreateMul(Builder.getInt32(-Part),
+                              getRuntimeVF(Builder, Builder.getInt32Ty(), VF));
+        PartPtr = cast<GetElementPtrInst>(
+            Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
+        PartPtr->setIsInBounds(InBounds);
+        // LastLane = 1 - (/*getRunTimeVF*/ VScale * VF.getKnownMinValue())
+        Value *LastLane =
+            Builder.CreateSub(Builder.getInt32(1),
+                              getRuntimeVF(Builder, Builder.getInt32Ty(), VF));
+        PartPtr = cast<GetElementPtrInst>(
+            Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
+        PartPtr->setIsInBounds(InBounds);
+      } else {
+        PartPtr = cast<GetElementPtrInst>(
+            Builder.CreateGEP(ScalarDataTy, Ptr,
+                              Builder.getInt32(-Part * VF.getKnownMinValue())));
+        PartPtr->setIsInBounds(InBounds);
+        PartPtr = cast<GetElementPtrInst>(
+            Builder.CreateGEP(ScalarDataTy, PartPtr,
+                              Builder.getInt32(1 - VF.getKnownMinValue())));
+        PartPtr->setIsInBounds(InBounds);
+      }
       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
     } else {
@@ -7315,8 +7326,6 @@
             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
       }
     }
-
-    assert(!VF.isScalable() && "VF is assumed to be non scalable");
     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
     return N *
            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -0,0 +1,120 @@
+; This is the C++ loop being vectorized in this file with
+; experimental.vector.reverse:
+;  #pragma clang loop vectorize_width(8, scalable)
+;  for (int i = N-1; i >= 0; --i)
+;    a[i] = b[i] + 1.0;
+
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define void @vector_reverse_double(i64 %N, double* %a, double* %b) #0 {
+; CHECK-LABEL: @vector_reverse_double
+; CHECK-LABEL: vector.body:
+; CHECK: %[[ZEXT:.*]] = zext i32 %{{.*}} to i64
+; CHECK-NEXT: %[[GEP:.*]] = getelementptr inbounds double, double* %b, i64 %[[ZEXT]]
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL:.*]] = mul i32 %[[VSCALE]], -8
+; CHECK-NEXT: %[[OR:.*]] = or i32 %[[MUL]], 1
+; CHECK-NEXT: %[[SEXT:.*]] = sext i32 %[[OR]] to i64
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i64 %[[SEXT]]
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <vscale x 8 x double>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <vscale x 8 x double>, <vscale x 8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %[[WIDE]])
+; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 8 x double> %[[REVERSE]], shufflevector
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* %a, i64 %[[ZEXT]]
+; CHECK-NEXT: %[[REVERSE6:.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %[[FADD]])
+; CHECK-NEXT: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], -8
+; CHECK-NEXT: %[[OR1:.*]] = or i32 %[[MUL1]], 1
+; CHECK-NEXT: %[[SEXT1:.*]] = sext i32 %[[OR1]] to i64
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i64 %[[SEXT1]]
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast double* %[[GEP3]] to <vscale x 8 x double>*
+; CHECK-NEXT: store <vscale x 8 x double> %[[REVERSE6]], <vscale x 8 x double>* %[[CAST1]], align 8
+
+entry:
+  %0 = trunc i64 %N to i32
+  %i.08 = add i32 %0, -1
+  %cmp9 = icmp sgt i32 %i.08, -1
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup.loopexit
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.010 = phi i32 [ %i.0, %for.body ], [ %i.08, %for.body.preheader ]
+  %idxprom7 = zext i32 %i.010 to i64
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %idxprom7
+  %1 = load double, double* %arrayidx, align 8
+  %add = fadd double %1, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %idxprom7
+  store double %add, double* %arrayidx2, align 8
+  %i.0 = add nsw i32 %i.010, -1
+  %cmp = icmp sgt i32 %i.010, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+}
+
+
+define void @vector_reverse_longint(i64 %N, i64* %a, i64* %b) #0 {
+; CHECK-LABEL: vector_reverse_longint
+; CHECK-LABEL: vector.body:
+; CHECK: %[[ZEXT:.*]] = zext i32 %{{.*}} to i64
+; CHECK-NEXT: %[[GEP:.*]] = getelementptr inbounds i64, i64* %b, i64 %[[ZEXT]]
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL:.*]] = mul i32 %[[VSCALE]], -8
+; CHECK-NEXT: %[[OR:.*]] = or i32 %[[MUL]], 1
+; CHECK-NEXT: %[[SEXT:.*]] = sext i32 %[[OR]] to i64
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i64 %[[SEXT]]
+; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <vscale x 8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %[[WIDE]])
+; CHECK-NEXT: %[[ADD:.*]] = add <vscale x 8 x i64> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* %a, i64 %[[ZEXT]]
+; CHECK-NEXT: %[[REVERSE6:.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %[[ADD]])
+; CHECK-NEXT: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], -8
+; CHECK-NEXT: %[[OR1:.*]] = or i32 %[[MUL1]], 1
+; CHECK-NEXT: %[[SEXT1:.*]] = sext i32 %[[OR1]] to i64
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i64 %[[SEXT1]]
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP3]] to <vscale x 8 x i64>*
+; CHECK-NEXT: store <vscale x 8 x i64> %[[REVERSE6]], <vscale x 8 x i64>* %[[CAST1]], align 8
+
+entry:
+  %0 = trunc i64 %N to i32
+  %i.010 = add i32 %0, -1
+  %cmp11 = icmp sgt i32 %i.010, -1
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup.loopexit
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.012 = phi i32 [ %i.0, %for.body ], [ %i.010, %for.body.preheader ]
+  %idxprom9 = zext i32 %i.012 to i64
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %idxprom9
+  %1 = load i64, i64* %arrayidx, align 8
+  %add = add i64 %1, 1
+  %arrayidx4 = getelementptr inbounds i64, i64* %a, i64 %idxprom9
+  store i64 %add, i64* %arrayidx4, align 8
+  %i.0 = add nsw i32 %i.012, -1
+  %cmp = icmp sgt i32 %i.012, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 8}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
@@ -0,0 +1,103 @@
+; Test reverse vectorization with fixed-width vectors (shufflevector path).
+; This is the C++ loop being vectorized in this file with
+; a shuffle reverse:
+;  #pragma clang loop vectorize_width(8, fixed)
+;  for (int i = N-1; i >= 0; --i)
+;    a[i] = b[i] + 1.0;
+
+; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define void @vector_reverse_double(i64 %N, double* %a, double* %b) #0 {
+; CHECK-LABEL: vector_reverse_double
+; CHECK-LABEL: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds double, double* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <8 x double>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x double>, <8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x double> %[[WIDE]]
+; CHECK-NEXT: %[[FADD:.*]] = fadd <8 x double> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x double> %[[FADD]]
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds double, double* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast double* %[[GEP4]] to <8 x double>*
+; CHECK-NEXT: store <8 x double> %[[REVERSE6]], <8 x double>* %[[CAST1]], align 8
+
+entry:
+  %0 = trunc i64 %N to i32
+  %i.08 = add i32 %0, -1
+  %cmp9 = icmp sgt i32 %i.08, -1
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.010 = phi i32 [ %i.0, %for.body ], [ %i.08, %for.body.preheader ]
+  %idxprom7 = zext i32 %i.010 to i64
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %idxprom7
+  %1 = load double, double* %arrayidx, align 8
+  %add = fadd double %1, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %idxprom7
+  store double %add, double* %arrayidx2, align 8
+  %i.0 = add nsw i32 %i.010, -1
+  %cmp = icmp sgt i32 %i.010, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+define void @vector_reverse_longint(i64 %N, i64* %a, i64* %b) #0 {
+; CHECK-LABEL: vector_reverse_longint
+; CHECK-LABEL: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, i64* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x i64>, <8 x i64>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x i64> %[[WIDE]]
+; CHECK-NEXT: %[[ADD:.*]] = add <8 x i64> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x i64> %[[ADD]]
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds i64, i64* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP4]] to <8 x i64>*
+; CHECK-NEXT: store <8 x i64> %[[REVERSE6]], <8 x i64>* %[[CAST1]], align 8
+
+entry:
+  %0 = trunc i64 %N to i32
+  %i.010 = add i32 %0, -1
+  %cmp11 = icmp sgt i32 %i.010, -1
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.012 = phi i32 [ %i.0, %for.body ], [ %i.010, %for.body.preheader ]
+  %idxprom9 = zext i32 %i.012 to i64
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %idxprom9
+  %1 = load i64, i64* %arrayidx, align 8
+  %add = add i64 %1, 1
+  %arrayidx4 = getelementptr inbounds i64, i64* %a, i64 %idxprom9
+  store i64 %add, i64* %arrayidx4, align 8
+  %i.0 = add nsw i32 %i.012, -1
+  %cmp = icmp sgt i32 %i.012, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 8}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}