diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2530,6 +2530,14 @@
     return CreateShuffleVector(V, PoisonValue::get(V->getType()), Mask, Name);
   }
 
+  Value *CreateVectorReverse(Value *V, const Twine &Name = "") {
+    auto *Ty = cast<VectorType>(V->getType());
+    Module *M = BB->getParent()->getParent();
+    Function *F = Intrinsic::getDeclaration(
+        M, Intrinsic::experimental_vector_reverse, Ty);
+    return Insert(CallInst::Create(F, V), Name);
+  }
+
   Value *CreateExtractValue(Value *Agg, ArrayRef<unsigned> Idxs,
                             const Twine &Name = "") {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -448,6 +448,13 @@
                                                  SrcTy.getSimpleVT()))
     return AdjustCost(Entry->Cost);
 
+  // Avoid calling into BasicTTIImpl for scalable vectors, which would
+  // assert. At the moment the cost in BasicTTIImpl is 0 for the ISD
+  // nodes that accept scalable vectors, but not all of the relevant
+  // ISD nodes accept scalable vectors yet.
+  if (isa<ScalableVectorType>(Src))
+    return 0;
+
   return AdjustCost(
       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
 }
@@ -1178,5 +1185,12 @@
     return LT.first * Entry->Cost;
   }
 
+  // Scalable vector types use the new experimental.vector.reverse intrinsic
+  // for TTI::SK_Reverse; fixed vector types keep using a shuffle vector.
+  if (Kind == TTI::SK_Reverse && isa<ScalableVectorType>(Tp)) {
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+    return LT.first + 1;
+  }
+
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2504,7 +2504,12 @@
 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
   assert(Vec->getType()->isVectorTy() && "Invalid type");
-  assert(!VF.isScalable() && "Cannot reverse scalable vectors");
+
+  // Call the intrinsic experimental.vector.reverse for scalable vectors.
+  if (VF.isScalable())
+    return Builder.CreateVectorReverse(Vec, "reverse");
+
+  // Keep the original shuffle-based behaviour for fixed vectors.
   SmallVector<int, 8> ShuffleMask;
   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
@@ -2792,16 +2797,16 @@
       InBounds = gep->isInBounds();
 
     if (Reverse) {
-      assert(!VF.isScalable() &&
-             "Reversing vectors is not yet supported for scalable vectors.");
-
+      unsigned NumElts = VF.isScalable()
+                             ? VectorLoopValueMap.getNumCachedLanes()
+                             : VF.getKnownMinValue();
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
+          ScalarDataTy, Ptr, Builder.getInt32(-Part * NumElts)));
       PartPtr->setIsInBounds(InBounds);
       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
+          ScalarDataTy, PartPtr, Builder.getInt32(1 - NumElts)));
       PartPtr->setIsInBounds(InBounds);
       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
@@ -7315,8 +7320,6 @@
           smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
     }
   }
-
-  assert(!VF.isScalable() && "VF is assumed to be non scalable");
   unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
   return N *
          TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -0,0 +1,104 @@
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; This is the loop in C++ being vectorized in this file with
+; experimental.vector.reverse:
+;   #pragma clang loop vectorize_width(8, scalable)
+;   for (int i = N-1; i >= 0; --i)
+;     a[i] = b[i] + 1.0;
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+define void @vecReverse_vla(i32 %N, double* nocapture %a, double* nocapture readonly %b) #0 {
+; CHECK-LABEL: @vecReverse_vla
+; CHECK: %[[WIDE:.*]] = load <vscale x 8 x double>, <vscale x 8 x double>* %{{.*}}, align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %[[WIDE]])
+; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 8 x double> %[[REVERSE]], shufflevector (<vscale x 8 x double> insertelement (<vscale x 8 x double> poison, double 1.000000e+00, i32 0), <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: %{{.*}} = getelementptr inbounds double, double* %{{.*}}, i64 -8
+; CHECK-NEXT: %[[REVERSE6:.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %[[FADD]])
+; CHECK-NEXT: %{{.*}} = getelementptr inbounds double, double* %{{.*}}, i64 %{{.*}}
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %{{.*}} to <vscale x 8 x double>*
+; CHECK-NEXT: store <vscale x 8 x double> %[[REVERSE6]], <vscale x 8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: %{{.*}} = call i64 @llvm.vscale.i64()
+
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup.loopexit
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv.next
+  %1 = load double, double* %arrayidx, align 8
+  %add = fadd double %1, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv.next
+  store double %add, double* %arrayidx2, align 8
+  %cmp = icmp sgt i64 %indvars.iv, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+}
+
+; Function Attrs: nofree norecurse nounwind uwtable mustprogress
+define void @vecReverse_i64(i32 %N, i64* nocapture %a, i64* nocapture readonly %b) #0 {
+; CHECK-LABEL: @vecReverse_i64
+; CHECK: %[[WIDE:.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* %{{.*}}, align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %[[WIDE]])
+; CHECK-NEXT: %[[CAST:.*]] = sitofp <vscale x 8 x i64> %[[REVERSE]] to <vscale x 8 x double>
+; CHECK-NEXT: %{{.*}} = fadd <vscale x 8 x double> %[[CAST]], shufflevector (<vscale x 8 x double> insertelement (<vscale x 8 x double> poison, double 1.000000e+00, i32 0), <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: %[[CAST2:.*]] = fptosi <vscale x 8 x double> %{{.*}} to <vscale x 8 x i64>
+; CHECK-NEXT: %[[GETPTR:.*]] = getelementptr inbounds i64, i64* %{{.*}}, i64 -8
+; CHECK-NEXT: %[[REVERSE6:.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %[[CAST2]])
+; CHECK-NEXT: %{{.*}} = getelementptr inbounds i64, i64* %[[GETPTR]], i64 %{{.*}}
+; CHECK-NEXT: %[[BITCAST:.*]] = bitcast i64* %{{.*}} to <vscale x 8 x i64>*
+; CHECK-NEXT: store <vscale x 8 x i64> %[[REVERSE6]], <vscale x 8 x i64>* %[[BITCAST]], align 8
+; CHECK-NEXT: %{{.*}} = call i64 @llvm.vscale.i64()
+
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup.loopexit
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %indvars.iv.next
+  %1 = load i64, i64* %arrayidx, align 8
+  %conv = sitofp i64 %1 to double
+  %add = fadd double %conv, 1.000000e+00
+  %conv1 = fptosi double %add to i64
+  %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 %indvars.iv.next
+  store i64 %conv1, i64* %arrayidx3, align 8
+  %cmp = icmp sgt i64 %indvars.iv, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 8}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
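
For reference, a minimal IR sketch of the two lowerings reverseVector produces with this change, assuming VF = vscale x 8 for the scalable case and VF = 4 for the fixed case; the function names below are illustrative only and are not taken from the patch. Scalable VFs call the new experimental.vector.reverse intrinsic, while fixed VFs keep the shufflevector with a descending-index mask:

; Sketch: scalable reverse goes through the intrinsic emitted by CreateVectorReverse.
define <vscale x 8 x double> @reverse_scalable(<vscale x 8 x double> %v) {
  %reverse = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %v)
  ret <vscale x 8 x double> %reverse
}

; Sketch: fixed-width reverse keeps the shuffle mask <3, 2, 1, 0> built by reverseVector.
define <4 x double> @reverse_fixed(<4 x double> %v) {
  %reverse = shufflevector <4 x double> %v, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x double> %reverse
}

declare <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double>)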