diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2530,6 +2530,23 @@
     return CreateShuffleVector(V, PoisonValue::get(V->getType()), Mask, Name);
   }
 
+  /// Return a vector holding the lanes of \p V in reverse order. Scalable
+  /// vectors use llvm.experimental.vector.reverse, fixed ones a shuffle.
+  Value *CreateVectorReverse(Value *V, const Twine &Name = "") {
+    auto *Ty = cast<VectorType>(V->getType());
+    if (isa<ScalableVectorType>(Ty)) {
+      Module *M = BB->getParent()->getParent();
+      Function *F = Intrinsic::getDeclaration(
+          M, Intrinsic::experimental_vector_reverse, Ty);
+      return Insert(CallInst::Create(F, V), Name);
+    }
+    // Fixed-width vectors keep the original reversed-mask shufflevector.
+    SmallVector<int, 8> ShuffleMask;
+    for (int I = Ty->getElementCount().getKnownMinValue() - 1; I >= 0; --I)
+      ShuffleMask.push_back(I);
+    return CreateShuffleVector(V, ShuffleMask, Name);
+  }
+
   Value *CreateExtractValue(Value *Agg,
                             ArrayRef<unsigned> Idxs,
                             const Twine &Name = "") {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2504,12 +2504,7 @@
 
 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
   assert(Vec->getType()->isVectorTy() && "Invalid type");
-  assert(!VF.isScalable() && "Cannot reverse scalable vectors");
-  SmallVector<int, 8> ShuffleMask;
-  for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
-    ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
-
-  return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
+  return Builder.CreateVectorReverse(Vec, "reverse");
 }
 
 // Return whether we allow using masked interleave-groups (for dealing with
@@ -2792,17 +2787,33 @@
       InBounds = gep->isInBounds();
 
     if (Reverse) {
-      assert(!VF.isScalable() &&
-             "Reversing vectors is not yet supported for scalable vectors.");
-
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
-      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
-      PartPtr->setIsInBounds(InBounds);
-      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
-      PartPtr->setIsInBounds(InBounds);
+      if (VF.isScalable()) {
+        // RunTimeVF = VScale * VF.getKnownMinValue(). Query it once and
+        // reuse it for both offsets instead of emitting it twice.
+        Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
+        // NumElt = -Part * RunTimeVF
+        Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
+        // LastLane = 1 - RunTimeVF
+        Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
+        PartPtr = cast<GetElementPtrInst>(
+            Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
+        PartPtr->setIsInBounds(InBounds);
+        PartPtr = cast<GetElementPtrInst>(
+            Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
+        PartPtr->setIsInBounds(InBounds);
+      } else {
+        // Fixed-width VF: both offsets are compile-time constants.
+        PartPtr = cast<GetElementPtrInst>(
+            Builder.CreateGEP(ScalarDataTy, Ptr,
+                              Builder.getInt32(-Part * VF.getKnownMinValue())));
+        PartPtr->setIsInBounds(InBounds);
+        PartPtr = cast<GetElementPtrInst>(
+            Builder.CreateGEP(ScalarDataTy, PartPtr,
+                              Builder.getInt32(1 - VF.getKnownMinValue())));
+        PartPtr->setIsInBounds(InBounds);
+      }
       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
     } else {
@@ -7315,8 +7326,6 @@
             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
       }
     }
-
-    assert(!VF.isScalable() && "VF is assumed to be non scalable");
     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
     return N *
            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -0,0 +1,120 @@
+; This is the C++ loop being vectorized in this file with
+; experimental.vector.reverse:
+;  #pragma clang loop vectorize_width(8, scalable)
+;  for (int i = N-1; i >= 0; --i)
+;    a[i] = b[i] + 1.0;
+
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define void @vector_reverse_double(i64 %N, double* %a, double* %b) #0 {
+; CHECK-LABEL: @vector_reverse_double
+; CHECK: vector.body:
+; CHECK: %[[ZEXT:.*]] = zext i32 %{{.*}} to i64
+; CHECK-NEXT: %[[GEP:.*]] = getelementptr inbounds double, double* %b, i64 %[[ZEXT]]
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL:.*]] = mul i32 %[[VSCALE]], -8
+; CHECK-NEXT: %[[OR:.*]] = or i32 %[[MUL]], 1
+; CHECK-NEXT: %[[SEXT:.*]] = sext i32 %[[OR]] to i64
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i64 %[[SEXT]]
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <vscale x 8 x double>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <vscale x 8 x double>, <vscale x 8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %[[WIDE]])
+; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 8 x double> %[[REVERSE]], shufflevector
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* %a, i64 %[[ZEXT]]
+; CHECK-NEXT: %[[REVERSE6:.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %[[FADD]])
+; CHECK-NEXT: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], -8
+; CHECK-NEXT: %[[OR1:.*]] = or i32 %[[MUL1]], 1
+; CHECK-NEXT: %[[SEXT1:.*]] = sext i32 %[[OR1]] to i64
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i64 %[[SEXT1]]
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast double* %[[GEP3]] to <vscale x 8 x double>*
+; CHECK-NEXT: store <vscale x 8 x double> %[[REVERSE6]], <vscale x 8 x double>* %[[CAST1]], align 8
+
+entry:
+  %0 = trunc i64 %N to i32
+  %i.08 = add i32 %0, -1
+  %cmp9 = icmp sgt i32 %i.08, -1
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup.loopexit
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.010 = phi i32 [ %i.0, %for.body ], [ %i.08, %for.body.preheader ]
+  %idxprom7 = zext i32 %i.010 to i64
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %idxprom7
+  %1 = load double, double* %arrayidx, align 8
+  %add = fadd double %1, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %idxprom7
+  store double %add, double* %arrayidx2, align 8
+  %i.0 = add nsw i32 %i.010, -1
+  %cmp = icmp sgt i32 %i.010, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+}
+
+
+define void @vector_reverse_longint(i64 %N, i64* %a, i64* %b) #0 {
+; CHECK-LABEL: @vector_reverse_longint
+; CHECK: vector.body:
+; CHECK: %[[ZEXT:.*]] = zext i32 %{{.*}} to i64
+; CHECK-NEXT: %[[GEP:.*]] = getelementptr inbounds i64, i64* %b, i64 %[[ZEXT]]
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL:.*]] = mul i32 %[[VSCALE]], -8
+; CHECK-NEXT: %[[OR:.*]] = or i32 %[[MUL]], 1
+; CHECK-NEXT: %[[SEXT:.*]] = sext i32 %[[OR]] to i64
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i64 %[[SEXT]]
+; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <vscale x 8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %[[WIDE]])
+; CHECK-NEXT: %[[ADD:.*]] = add <vscale x 8 x i64> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* %a, i64 %[[ZEXT]]
+; CHECK-NEXT: %[[REVERSE6:.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %[[ADD]])
+; CHECK-NEXT: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], -8
+; CHECK-NEXT: %[[OR1:.*]] = or i32 %[[MUL1]], 1
+; CHECK-NEXT: %[[SEXT1:.*]] = sext i32 %[[OR1]] to i64
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i64 %[[SEXT1]]
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP3]] to <vscale x 8 x i64>*
+; CHECK-NEXT: store <vscale x 8 x i64> %[[REVERSE6]], <vscale x 8 x i64>* %[[CAST1]], align 8
+
+entry:
+  %0 = trunc i64 %N to i32
+  %i.010 = add i32 %0, -1
+  %cmp11 = icmp sgt i32 %i.010, -1
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup.loopexit
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.012 = phi i32 [ %i.0, %for.body ], [ %i.010, %for.body.preheader ]
+  %idxprom9 = zext i32 %i.012 to i64
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %idxprom9
+  %1 = load i64, i64* %arrayidx, align 8
+  %add = add i64 %1, 1
+  %arrayidx4 = getelementptr inbounds i64, i64* %a, i64 %idxprom9
+  store i64 %add, i64* %arrayidx4, align 8
+  %i.0 = add nsw i32 %i.012, -1
+  %cmp = icmp sgt i32 %i.012, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 8}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
@@ -0,0 +1,103 @@
+; Test that a reversed loop vectorized with a fixed-width VF still uses a
+; shufflevector reverse when SVE is enabled. This is the C++ loop being
+; vectorized in this file:
+;  #pragma clang loop vectorize_width(8, fixed)
+;  for (int i = N-1; i >= 0; --i)
+;    a[i] = b[i] + 1.0;
+
+; RUN: opt -loop-vectorize -dce  -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define void @vector_reverse_double(i64 %N, double* %a, double* %b) #0 {
+; CHECK-LABEL: @vector_reverse_double
+; CHECK: vector.body:
+; CHECK: %[[GEP:.*]] = getelementptr inbounds double, double* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <8 x double>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x double>, <8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x double> %[[WIDE]]
+; CHECK-NEXT: %[[FADD:.*]] = fadd <8 x double> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x double> %[[FADD]]
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds double, double* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast double* %[[GEP4]] to <8 x double>*
+; CHECK-NEXT: store <8 x double> %[[REVERSE6]], <8 x double>* %[[CAST1]], align 8
+
+entry:
+  %0 = trunc i64 %N to i32
+  %i.08 = add i32 %0, -1
+  %cmp9 = icmp sgt i32 %i.08, -1
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.010 = phi i32 [ %i.0, %for.body ], [ %i.08, %for.body.preheader ]
+  %idxprom7 = zext i32 %i.010 to i64
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %idxprom7
+  %1 = load double, double* %arrayidx, align 8
+  %add = fadd double %1, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %idxprom7
+  store double %add, double* %arrayidx2, align 8
+  %i.0 = add nsw i32 %i.010, -1
+  %cmp = icmp sgt i32 %i.010, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+define void @vector_reverse_longint(i64 %N, i64* %a, i64* %b) #0 {
+; CHECK-LABEL: @vector_reverse_longint
+; CHECK: vector.body:
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, i64* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x i64>, <8 x i64>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x i64> %[[WIDE]]
+; CHECK-NEXT: %[[ADD:.*]] = add <8 x i64> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x i64> %[[ADD]]
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds i64, i64* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP4]] to <8 x i64>*
+; CHECK-NEXT: store <8 x i64> %[[REVERSE6]], <8 x i64>* %[[CAST1]], align 8
+
+entry:
+  %0 = trunc i64 %N to i32
+  %i.010 = add i32 %0, -1
+  %cmp11 = icmp sgt i32 %i.010, -1
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.012 = phi i32 [ %i.0, %for.body ], [ %i.010, %for.body.preheader ]
+  %idxprom9 = zext i32 %i.012 to i64
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %idxprom9
+  %1 = load i64, i64* %arrayidx, align 8
+  %add = add i64 %1, 1
+  %arrayidx4 = getelementptr inbounds i64, i64* %a, i64 %idxprom9
+  store i64 %add, i64* %arrayidx4, align 8
+  %i.0 = add nsw i32 %i.012, -1
+  %cmp = icmp sgt i32 %i.012, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 8}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}