Index: llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -83,6 +83,11 @@
                                  VectorType *SubTp,
                                  ArrayRef<const Value *> Args = None);
 
+  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
+                                  MaybeAlign Alignment, unsigned AddressSpace,
+                                  TTI::TargetCostKind CostKind,
+                                  const Instruction *I = nullptr);
+
   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                         TTI::TargetCostKind CostKind);
Index: llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -10,6 +10,7 @@
 #include "MCTargetDesc/RISCVMatInt.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include <cmath>
 
 using namespace llvm;
@@ -180,8 +181,11 @@
   if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
     return getSpliceCost(Tp, Index);
 
+  // TODO: Add Cost Model for SK_PermuteSingleSrc/SK_Select
+  // TODO: For a fixed-width vector, add Cost Model for SK_Reverse/SK_Broadcast
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
-  if (Kind == TTI::SK_Broadcast && isa<ScalableVectorType>(Tp))
+  if ((Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse) &&
+      isa<ScalableVectorType>(Tp))
     return LT.first * 1;
 
   return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
@@ -194,7 +198,6 @@
   if (!isa<ScalableVectorType>(Src))
     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                         CostKind);
-
   return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
 }
@@ -224,6 +227,32 @@
   return NumLoads * MemOpCost;
 }
 
+InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
+                                              MaybeAlign Alignment,
+                                              unsigned AddressSpace,
+                                              TTI::TargetCostKind CostKind,
+                                              const Instruction *I) {
+  EVT VT = TLI->getValueType(DL, Ty, true);
+  // Type legalization can't handle structs
+  if (VT == MVT::Other)
+    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
+                                  CostKind);
+
+  auto LT = TLI->getTypeLegalizationCost(DL, Ty);
+  if (!LT.first.isValid())
+    return InstructionCost::getInvalid();
+
+  // TODO: consider latency as well for TCK_SizeAndLatency.
+  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
+    return LT.first;
+
+  if (CostKind != TTI::TCK_RecipThroughput)
+    return 1;
+
+  // TODO: Check truncation stores and extending loads
+  return LT.first;
+}
+
 InstructionCost
 RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                     TTI::TargetCostKind CostKind) {
Index: llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
===================================================================
--- llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
+++ llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
@@ -55,20 +55,20 @@
 define void @vector_reverse() {
 ; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
Index: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple riscv64-linux-gnu \
+; RUN:   -mattr=+v -debug-only=loop-vectorize \
+; RUN:   -riscv-v-vector-bits-min=128 -S < %s 2>&1 | FileCheck %s
+
+define dso_local void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @vector_reverse_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
+; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 4
+; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP0]], -4
+; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[DOTNEG]], [[TMP12]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP11]], 4
+; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP13]]
+;
CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END5:%.*]] = sub i32 [[N]], [[CAST_VTC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP17]], -1 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[N]] +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG10:%.*]] = mul i32 [[TMP22]], -4 +; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[DOTNEG10]], 1 +; CHECK-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP24]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP25]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP27:%.*]] = shl i32 [[TMP26]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = sub i32 1, [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = sext i32 [[TMP28]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = sext i32 [[TMP29]] to i64 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i64 [[TMP32]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP33]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP35:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG11:%.*]] = mul i32 [[TMP36]], -4 +; CHECK-NEXT: [[TMP37:%.*]] = or i32 [[DOTNEG11]], 1 +; CHECK-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i64 [[TMP38]] +; CHECK-NEXT: store [[TMP35]], ptr [[TMP39]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP40:%.*]] = add [[WIDE_LOAD6]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = shl i32 [[TMP41]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = sub i32 0, [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = sub i32 1, [[TMP42]] +; CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP43]] to i64 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP44]] to i64 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP46]], i64 [[TMP47]] +; CHECK-NEXT: store [[TMP40]], ptr [[TMP48]], align 4, !alias.scope !3, !noalias !0 
+; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP50:%.*]] = shl i64 [[TMP49]], 3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[TMP0]], [[VECTOR_SCEVCHECK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_SCEVCHECK]] ], [ [[N]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[I_0:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = add i32 [[TMP52]], 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] + %i.0 = add nsw i32 %i.0.in8, -1 + %idxprom = zext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom + %1 = load i32, ptr %arrayidx, align 4 + %add9 = add i32 %1, 1 + %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom + store i32 %add9, ptr %arrayidx3, align 4 + %cmp = icmp ugt i64 %indvars.iv, 1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0 +} + +; Function Attrs: argmemonly nofree norecurse nosync nounwind +define dso_local void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) local_unnamed_addr #0 { +; CHECK-LABEL: @vector_reverse_f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 +; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 4 +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP0]], -4 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[DOTNEG]], [[TMP12]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP11]], 4 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP14]] +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP13]] +; CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END5:%.*]] = sub i32 [[N]], [[CAST_VTC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP17]], -1 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[N]] +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG10:%.*]] = mul i32 [[TMP22]], -4 +; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[DOTNEG10]], 1 +; CHECK-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP24]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP25]], align 4, !alias.scope !9 +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP27:%.*]] = shl i32 [[TMP26]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = sub i32 1, [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = sext i32 [[TMP28]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = sext i32 [[TMP29]] to i64 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP31]], i64 [[TMP32]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , 
ptr [[TMP33]], align 4, !alias.scope !9 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP35:%.*]] = fadd [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 1.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG11:%.*]] = mul i32 [[TMP36]], -4 +; CHECK-NEXT: [[TMP37:%.*]] = or i32 [[DOTNEG11]], 1 +; CHECK-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP38]] +; CHECK-NEXT: store [[TMP35]], ptr [[TMP39]], align 4, !alias.scope !12, !noalias !9 +; CHECK-NEXT: [[TMP40:%.*]] = fadd [[WIDE_LOAD6]], shufflevector ( insertelement ( poison, float 1.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = shl i32 [[TMP41]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = sub i32 0, [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = sub i32 1, [[TMP42]] +; CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP43]] to i64 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP44]] to i64 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[TMP46]], i64 [[TMP47]] +; CHECK-NEXT: store [[TMP40]], ptr [[TMP48]], align 4, !alias.scope !12, !noalias !9 +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP50:%.*]] = shl i64 [[TMP49]], 3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[TMP0]], [[VECTOR_SCEVCHECK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_SCEVCHECK]] ], [ [[N]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[I_0:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CONV1:%.*]] = fadd float [[TMP52]], 1.000000e+00 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: store float [[CONV1]], ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br 
i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] + %i.0 = add nsw i32 %i.0.in8, -1 + %idxprom = zext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom + %1 = load float, ptr %arrayidx, align 4 + %conv1 = fadd float %1, 1.000000e+00 + %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom + store float %conv1, ptr %arrayidx3, align 4 + %cmp = icmp ugt i64 %indvars.iv, 1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0 +} + +attributes #0 = { argmemonly nofree norecurse nosync nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+a,+c,+m,+relax,-save-restore" } + +!0 = distinct !{!0, !1, !2, !3, !4} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!4 = !{!"llvm.loop.vectorize.enable", i1 true}
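
Not part of the patch itself: a quick way to sanity-check the new cost hooks locally, assuming an opt built with the changes above, is the new-PM cost-model printer. The file and function names below are made up for illustration; with the patch applied, the reverse shuffle and the plain scalable-vector load should each be reported with an estimated cost of 1 rather than an invalid cost, matching the updated rvv-shuffle.ll expectations.

; reverse-cost-smoke.ll (illustrative only, not a committed test)
; Query the printed costs with (illustrative command):
;   opt -passes="print<cost-model>" -disable-output -mtriple=riscv64 -mattr=+v reverse-cost-smoke.ll 2>&1

define <vscale x 4 x i32> @reverse_after_load(ptr %p) {
  ; Plain scalable-vector load: now costed by RISCVTTIImpl::getMemoryOpCost (LT.first for a legal type).
  %v = load <vscale x 4 x i32>, ptr %p, align 4
  ; SK_Reverse on a scalable vector: now costed by getShuffleCost instead of falling back to an invalid cost.
  %r = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %v)
  ret <vscale x 4 x i32> %r
}

declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)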