diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1245,4 +1245,15 @@
     defm vsuxseg # nf : RISCVISegStore<nf>;
   }
 
+  // Strided loads/stores for fixed vectors.
+  def int_riscv_masked_strided_load
+        : Intrinsic<[llvm_anyvector_ty],
+                    [LLVMMatchType<0>, llvm_anyptr_ty,
+                     llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                    [NoCapture<ArgIndex<1>>, IntrReadMem]>;
+  def int_riscv_masked_strided_store
+        : Intrinsic<[],
+                    [llvm_anyvector_ty, llvm_anyptr_ty,
+                     llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                    [NoCapture<ArgIndex<1>>, IntrWriteMem]>;
 } // TargetPrefix = "riscv"
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -24,6 +24,7 @@
   RISCVExpandAtomicPseudoInsts.cpp
   RISCVExpandPseudoInsts.cpp
   RISCVFrameLowering.cpp
+  RISCVGatherScatterLowering.cpp
   RISCVInsertVSETVLI.cpp
   RISCVInstrInfo.cpp
   RISCVInstructionSelector.cpp
@@ -50,6 +51,7 @@
   SelectionDAG
   Support
   Target
+  TransformUtils
   GlobalISel
 
   ADD_TO_COMPONENT
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -37,6 +37,9 @@
 FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM);
 
+FunctionPass *createRISCVGatherScatterLoweringPass();
+void initializeRISCVGatherScatterLoweringPass(PassRegistry &);
+
 FunctionPass *createRISCVMergeBaseOffsetOptPass();
 void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -0,0 +1,473 @@
+//===- RISCVGatherScatterLowering.cpp - Gather/Scatter lowering -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass custom lowers llvm.masked.gather and llvm.masked.scatter
+// instructions to RISCV intrinsics.
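+//
+// As a rough sketch of the intended rewrite (illustrative IR only, not taken
+// verbatim from the tests in this patch), a gather over a strided address
+// pattern such as
+//
+//   %ptrs = getelementptr inbounds i8, i8* %B, <32 x i64> %strided.index
+//   %g = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %ptrs,
+//                                  i32 1, <32 x i1> %mask, <32 x i8> undef)
+//
+// is rewritten to use a scalar base pointer and a scalar byte stride:
+//
+//   %g = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(
+//                <32 x i8> undef, i8* %base, i64 %stride, <32 x i1> %mask)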
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-gather-scatter-lowering"
+
+namespace {
+
+class RISCVGatherScatterLowering : public FunctionPass {
+  const RISCVSubtarget *ST = nullptr;
+  const RISCVTargetLowering *TLI = nullptr;
+  LoopInfo *LI = nullptr;
+  const DataLayout *DL = nullptr;
+
+  SmallVector<WeakTrackingVH> MaybeDeadPHIs;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  RISCVGatherScatterLowering() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetPassConfig>();
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+
+  StringRef getPassName() const override {
+    return "RISCV gather/scatter lowering";
+  }
+
+private:
+  bool isLegalTypeAndAlignment(Type *DataType, Value *AlignOp);
+
+  bool tryCreateStridedLoadStore(IntrinsicInst *II, Type *DataType, Value *Ptr,
+                                 Value *AlignOp);
+
+  std::pair<Value *, Value *> determineBaseAndStride(GetElementPtrInst *GEP,
+                                                     IRBuilder<> &Builder);
+
+  bool matchStridedRecurrence(Value *Index, Loop *L, Value *&Stride,
+                              PHINode *&BasePtr, BinaryOperator *&Inc,
+                              IRBuilder<> &Builder);
+};
+
+} // end anonymous namespace
+
+char RISCVGatherScatterLowering::ID = 0;
+
+INITIALIZE_PASS(RISCVGatherScatterLowering, DEBUG_TYPE,
+                "RISCV gather/scatter lowering pass", false, false)
+
+FunctionPass *llvm::createRISCVGatherScatterLoweringPass() {
+  return new RISCVGatherScatterLowering();
+}
+
+bool RISCVGatherScatterLowering::isLegalTypeAndAlignment(Type *DataType,
+                                                         Value *AlignOp) {
+  Type *ScalarType = DataType->getScalarType();
+  if (!TLI->isLegalElementTypeForRVV(ScalarType))
+    return false;
+
+  MaybeAlign MA = cast<ConstantInt>(AlignOp)->getMaybeAlignValue();
+  if (MA && MA->value() < DL->getTypeStoreSize(ScalarType).getFixedSize())
+    return false;
+
+  // FIXME: Let the backend type legalize by splitting/widening?
+  EVT DataVT = TLI->getValueType(*DL, DataType);
+  if (!TLI->isTypeLegal(DataVT))
+    return false;
+
+  return true;
+}
+
+// TODO: Should we consider the mask when looking for a stride?
+static std::pair<Value *, Value *> matchStridedConstant(Constant *StartC) {
+  unsigned NumElts = cast<FixedVectorType>(StartC->getType())->getNumElements();
+
+  // Check that the start value is a strided constant.
+  auto *StartVal =
+      dyn_cast_or_null<ConstantInt>(StartC->getAggregateElement((unsigned)0));
+  if (!StartVal)
+    return std::make_pair(nullptr, nullptr);
+  APInt StrideVal(StartVal->getValue().getBitWidth(), 0);
+  ConstantInt *Prev = StartVal;
+  for (unsigned i = 1; i != NumElts; ++i) {
+    auto *C = dyn_cast_or_null<ConstantInt>(StartC->getAggregateElement(i));
+    if (!C)
+      return std::make_pair(nullptr, nullptr);
+
+    APInt LocalStride = C->getValue() - Prev->getValue();
+    if (i == 1)
+      StrideVal = LocalStride;
+    else if (StrideVal != LocalStride)
+      return std::make_pair(nullptr, nullptr);
+
+    Prev = C;
+  }
+
+  Value *Stride = ConstantInt::get(StartVal->getType(), StrideVal);
+
+  return std::make_pair(StartVal, Stride);
+}
+
+// Recursively walk back up the use-def chain until we find a Phi with a strided
+// start value.
Build and update a scalar recurrence as we unwind the recursion. +// We also update the Stride as we unwind. Our goal is to move all of the +// arithmetic out of the loop. +bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L, + Value *&Stride, + PHINode *&BasePtr, + BinaryOperator *&Inc, + IRBuilder<> &Builder) { + // Our base case is a Phi. + if (auto *Phi = dyn_cast(Index)) { + // A phi node we want to perform this function on should be from the + // loop header. + if (Phi->getParent() != L->getHeader()) + return false; + + Value *Step, *Start; + if (!matchSimpleRecurrence(Phi, Inc, Start, Step) || + Inc->getOpcode() != Instruction::Add) + return false; + unsigned IncrementingBlock = Phi->getIncomingValue(0) == Inc ? 0 : 1; + + // Only proceed if the step is loop invariant. + if (!L->isLoopInvariant(Step)) + return false; + + // Step should be a splat. + Step = getSplatValue(Step); + if (!Step) + return false; + + // Start should be a strided constant. + auto *StartC = dyn_cast(Start); + if (!StartC) + return false; + + std::tie(Start, Stride) = matchStridedConstant(StartC); + if (!Start) + return false; + assert(Stride != nullptr); + + // Build scalar phi and increment. + BasePtr = + PHINode::Create(Start->getType(), 2, Phi->getName() + ".scalar", Phi); + Inc = BinaryOperator::CreateAdd(BasePtr, Step, Inc->getName() + ".scalar", + Inc); + BasePtr->addIncoming(Start, Phi->getIncomingBlock(1 - IncrementingBlock)); + BasePtr->addIncoming(Inc, Phi->getIncomingBlock(IncrementingBlock)); + + // Note that this Phi might be eligible for removal. + MaybeDeadPHIs.push_back(Phi); + return true; + } + + // Otherwise look for binary operator. + auto *BO = dyn_cast(Index); + if (!BO) + return false; + + if (BO->getOpcode() != Instruction::Add && + BO->getOpcode() != Instruction::Or && + BO->getOpcode() != Instruction::Mul && + BO->getOpcode() != Instruction::Shl) + return false; + + // Only support shift by constant. + if (BO->getOpcode() == Instruction::Shl && !isa(BO->getOperand(1))) + return false; + + // We need to be able to treat Or as Add. + if (BO->getOpcode() == Instruction::Or && + !haveNoCommonBitsSet(BO->getOperand(0), BO->getOperand(1), *DL)) + return false; + + // We should have one operand in the loop and one splat. + Value *OtherOp; + if (isa(BO->getOperand(0)) && + L->contains(cast(BO->getOperand(0)))) { + Index = cast(BO->getOperand(0)); + OtherOp = BO->getOperand(1); + } else if (isa(BO->getOperand(1)) && + L->contains(cast(BO->getOperand(1)))) { + Index = cast(BO->getOperand(1)); + OtherOp = BO->getOperand(0); + } else { + return false; + } + + // Make sure other op is loop invariant. + if (!L->isLoopInvariant(OtherOp)) + return false; + + // Make sure we have a splat. + Value *SplatOp = getSplatValue(OtherOp); + if (!SplatOp) + return false; + + // Recurse up the use-def chain. + if (!matchStridedRecurrence(Index, L, Stride, BasePtr, Inc, Builder)) + return false; + + // Locate the Step and Start values from the recurrence. + unsigned StepIndex = Inc->getOperand(0) == BasePtr ? 1 : 0; + unsigned StartBlock = BasePtr->getOperand(0) == Inc ? 1 : 0; + Value *Step = Inc->getOperand(StepIndex); + Value *Start = BasePtr->getOperand(StartBlock); + + // We need to adjust the start value in the preheader. 
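+  // Sketch of the adjustment made by the switch below (C denotes the splat
+  // value, purely illustrative): for an Add/Or the scalar start simply gains
+  // C; for a Mul or Shl the start, the scalar step and the recovered stride
+  // are all multiplied by (or shifted left by) C.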
+ Builder.SetInsertPoint( + BasePtr->getIncomingBlock(StartBlock)->getTerminator()); + Builder.SetCurrentDebugLocation(DebugLoc()); + + switch (BO->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + case Instruction::Add: + case Instruction::Or: { + // An add only affects the start value. It's ok to do this for Or because + // we already checked that there are no common set bits. + + // If the start value is Zero, just take the SplatOp. + if (isa(Start) && cast(Start)->isZero()) + Start = SplatOp; + else + Start = Builder.CreateAdd(Start, SplatOp, "start"); + BasePtr->setIncomingValue(StartBlock, Start); + break; + } + case Instruction::Mul: { + // If the start is zero we don't need to multiply. + if (!isa(Start) || !cast(Start)->isZero()) + Start = Builder.CreateMul(Start, SplatOp, "start"); + + Step = Builder.CreateMul(Step, SplatOp, "step"); + + // If the Stride is 1 just take the SplatOpt. + if (isa(Stride) && cast(Stride)->isOne()) + Stride = SplatOp; + else + Stride = Builder.CreateMul(Stride, SplatOp, "stride"); + Inc->setOperand(StepIndex, Step); + BasePtr->setIncomingValue(StartBlock, Start); + break; + } + case Instruction::Shl: { + // If the start is zero we don't need to shift. + if (!isa(Start) || !cast(Start)->isZero()) + Start = Builder.CreateShl(Start, SplatOp, "start"); + Step = Builder.CreateShl(Step, SplatOp, "step"); + Stride = Builder.CreateShl(Stride, SplatOp, "stride"); + Inc->setOperand(StepIndex, Step); + BasePtr->setIncomingValue(StartBlock, Start); + break; + } + } + + return true; +} + +std::pair +RISCVGatherScatterLowering::determineBaseAndStride(GetElementPtrInst *GEP, + IRBuilder<> &Builder) { + + SmallVector Ops(GEP->operands()); + + // Base pointer needs to be a scalar. + if (Ops[0]->getType()->isVectorTy()) + return std::make_pair(nullptr, nullptr); + + // Make sure we're in a loop and it is in loop simplify form. + Loop *L = LI->getLoopFor(GEP->getParent()); + if (!L || !L->isLoopSimplifyForm()) + return std::make_pair(nullptr, nullptr); + + int VecOperand = -1; + unsigned TypeScale = 0; + + // Look for a vector operand and scale. + gep_type_iterator GTI = gep_type_begin(GEP); + for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { + if (!Ops[i]->getType()->isVectorTy()) + continue; + + if (VecOperand >= 0) + return std::make_pair(nullptr, nullptr); + + VecOperand = i; + + TypeSize TS = DL->getTypeAllocSize(GTI.getIndexedType()); + if (TS.isScalable()) + return std::make_pair(nullptr, nullptr); + + TypeScale = TS.getFixedSize(); + } + + // We need to find a vector index to simplify. + if (VecOperand < 0) + return std::make_pair(nullptr, nullptr); + + // We can't extract the stride if the arithmetic is done at a different size + // than the pointer type. Adding the stride later may not wrap correctly. + // Technically we could handle wider indices, but I don't expect that in + // practice. + Value *VecIndex = Ops[VecOperand]; + Type *VecIntPtrTy = DL->getIntPtrType(GEP->getType()); + if (VecIndex->getType() != VecIntPtrTy) + return std::make_pair(nullptr, nullptr); + + Value *Stride; + BinaryOperator *Inc; + PHINode *BasePhi; + if (!matchStridedRecurrence(VecIndex, L, Stride, BasePhi, Inc, Builder)) + return std::make_pair(nullptr, nullptr); + + unsigned IncrementingBlock = BasePhi->getOperand(0) == Inc ? 0 : 1; + + Builder.SetInsertPoint(GEP); + + // Replace the vector index with the scalar phi and build a scalar GEP. 
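+  // As an illustrative example (names are only for exposition): a vector
+  // access such as
+  //   getelementptr i32, i32* %B, <8 x i64> %vec.ind
+  // becomes the scalar
+  //   getelementptr i32, i32* %B, i64 %vec.ind.scalar
+  // and is then cast to i8* so that the stride computed below can be
+  // expressed in bytes.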
+ Ops[VecOperand] = BasePhi; + Type *SourceTy = GEP->getSourceElementType(); + Value *BasePtr = + Builder.CreateGEP(SourceTy, Ops[0], makeArrayRef(Ops).drop_front()); + + // Cast the GEP to an i8*. + LLVMContext &Ctx = GEP->getContext(); + Type *I8PtrTy = + Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace()); + if (BasePtr->getType() != I8PtrTy) + BasePtr = Builder.CreatePointerCast(BasePtr, I8PtrTy); + + // Final adjustments to stride should go in the start block. + Builder.SetInsertPoint( + BasePhi->getIncomingBlock(1 - IncrementingBlock)->getTerminator()); + + // Convert stride to pointer size if needed. + Type *IntPtrTy = DL->getIntPtrType(BasePtr->getType()); + assert(Stride->getType() == IntPtrTy && "Unexpected type"); + + // Scale the stride by the size of the indexed type. + if (TypeScale != 1) + Stride = Builder.CreateMul(Stride, ConstantInt::get(IntPtrTy, TypeScale)); + + auto BaseAndStride = std::make_pair(BasePtr, Stride); + + return BaseAndStride; +} + +bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II, + Type *DataType, + Value *Ptr, + Value *AlignOp) { + // Make sure the operation will be supported by the backend. + if (!isLegalTypeAndAlignment(DataType, AlignOp)) + return false; + + // Pointer should be a GEP. + auto *GEP = dyn_cast(Ptr); + if (!GEP) + return false; + + IRBuilder<> Builder(GEP); + + Value *BasePtr, *Stride; + std::tie(BasePtr, Stride) = determineBaseAndStride(GEP, Builder); + if (!BasePtr) + return false; + assert(Stride != nullptr); + + Builder.SetInsertPoint(II); + + CallInst *Call; + if (II->getIntrinsicID() == Intrinsic::masked_gather) + Call = Builder.CreateIntrinsic( + Intrinsic::riscv_masked_strided_load, + {DataType, BasePtr->getType(), Stride->getType()}, + {II->getArgOperand(3), BasePtr, Stride, II->getArgOperand(2)}); + else + Call = Builder.CreateIntrinsic( + Intrinsic::riscv_masked_strided_store, + {DataType, BasePtr->getType(), Stride->getType()}, + {II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3)}); + + Call->takeName(II); + II->replaceAllUsesWith(Call); + II->eraseFromParent(); + + if (GEP->use_empty()) + RecursivelyDeleteTriviallyDeadInstructions(GEP); + + return true; +} + +bool RISCVGatherScatterLowering::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto &TPC = getAnalysis(); + auto &TM = TPC.getTM(); + ST = &TM.getSubtarget(F); + if (!ST->hasStdExtV() || !ST->useRVVForFixedLengthVectors()) + return false; + + TLI = ST->getTargetLowering(); + DL = &F.getParent()->getDataLayout(); + LI = &getAnalysis().getLoopInfo(); + + SmallVector Gathers; + SmallVector Scatters; + + bool Changed = false; + + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + // Do an initial optimization pass to push out as much address arithmetic + // as possible to get a more canonical IR. + IntrinsicInst *II = dyn_cast(&I); + if (II && II->getIntrinsicID() == Intrinsic::masked_gather && + isa(II->getType())) { + Gathers.push_back(II); + } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter && + isa(II->getArgOperand(0)->getType())) { + Scatters.push_back(II); + } + } + } + + // Rewrite gather/scatter to form strided load/store if possible. + for (auto *II : Gathers) + Changed |= tryCreateStridedLoadStore( + II, II->getType(), II->getArgOperand(0), II->getArgOperand(1)); + for (auto *II : Scatters) + Changed |= + tryCreateStridedLoadStore(II, II->getArgOperand(0)->getType(), + II->getArgOperand(1), II->getArgOperand(2)); + + // Remove any dead phis. 
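+  // A gather/scatter rewritten above now uses the scalar recurrence, so the
+  // original vector induction PHI recorded in matchStridedRecurrence may be
+  // left without users and can be cleaned up here.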
+ while (!MaybeDeadPHIs.empty()) { + if (auto *Phi = dyn_cast_or_null(MaybeDeadPHIs.pop_back_val())) + RecursivelyDeleteDeadPHINode(Phi); + } + + return Changed; +} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -508,6 +508,8 @@ bool shouldRemoveExtendFromGSIndex(EVT VT) const override; + bool isLegalElementTypeForRVV(Type *ScalarTy) const; + private: /// RISCVCCAssignFn - This target-specific function extends the default /// CCValAssign with additional information used to lower RISC-V calling @@ -556,6 +558,7 @@ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVectorMaskVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -429,6 +429,7 @@ } setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); static unsigned IntegerVPOps[] = { ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL, ISD::VP_SDIV, ISD::VP_UDIV, @@ -917,6 +918,23 @@ MachineMemOperand::MOVolatile; return true; } + case Intrinsic::riscv_masked_strided_load: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = nullptr; + Info.memVT = MVT::getVT(I.getType()->getScalarType()); + Info.align = Align(I.getType()->getScalarSizeInBits() / 8); + Info.size = ~UINT64_C(0); + Info.flags |= MachineMemOperand::MOLoad; + return true; + case Intrinsic::riscv_masked_strided_store: + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = nullptr; + Info.memVT = MVT::getVT(I.getArgOperand(0)->getType()->getScalarType()); + Info.align = + Align(I.getArgOperand(0)->getType()->getScalarSizeInBits() / 8); + Info.size = ~UINT64_C(0); + Info.flags |= MachineMemOperand::MOStore; + return true; } } @@ -1190,6 +1208,24 @@ (VT.isFixedLengthVector() && VT.getVectorElementType() == MVT::i1); } +bool RISCVTargetLowering::isLegalElementTypeForRVV(Type *ScalarTy) const { + if (ScalarTy->isPointerTy()) + return true; + + if (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) || + ScalarTy->isIntegerTy(32) || ScalarTy->isIntegerTy(64)) + return true; + + if (ScalarTy->isHalfTy()) + return Subtarget.hasStdExtZfh(); + if (ScalarTy->isFloatTy()) + return Subtarget.hasStdExtF(); + if (ScalarTy->isDoubleTy()) + return Subtarget.hasStdExtD(); + + return false; +} + static bool useRVVForFixedLengthVectorVT(MVT VT, const RISCVSubtarget &Subtarget) { assert(VT.isFixedLengthVector() && "Expected a fixed length vector type!"); @@ -2252,6 +2288,8 @@ return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); + case ISD::INTRINSIC_VOID: + return LowerINTRINSIC_VOID(Op, DAG); case ISD::BSWAP: case ISD::BITREVERSE: { // Convert BSWAP/BITREVERSE to GREVI to enable GREVI combinining. 
@@ -3799,9 +3837,109 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { + unsigned IntNo = Op.getConstantOperandVal(1); + switch (IntNo) { + default: + break; + case Intrinsic::riscv_masked_strided_load: { + SDLoc DL(Op); + MVT XLenVT = Subtarget.getXLenVT(); + + // If the mask is known to be all ones, optimize to an unmasked intrinsic; + // the selection of the masked intrinsics doesn't do this for us. + SDValue Mask = Op.getOperand(5); + bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode()); + + MVT VT = Op->getSimpleValueType(0); + MVT ContainerVT = getContainerForFixedLengthVector(VT); + + SDValue PassThru = Op.getOperand(2); + if (!IsUnmasked) { + MVT MaskVT = + MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget); + } + + SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + + SDValue IntID = DAG.getTargetConstant( + IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL, + XLenVT); + + auto *Load = cast(Op); + SmallVector Ops{Load->getChain(), IntID}; + if (!IsUnmasked) + Ops.push_back(PassThru); + Ops.push_back(Op.getOperand(3)); // Ptr + Ops.push_back(Op.getOperand(4)); // Stride + if (!IsUnmasked) + Ops.push_back(Mask); + Ops.push_back(VL); + + SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other}); + SDValue Result = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, + Load->getMemoryVT(), Load->getMemOperand()); + SDValue Chain = Result.getValue(1); + Result = convertFromScalableVector(VT, Result, DAG, Subtarget); + return DAG.getMergeValues({Result, Chain}, DL); + } + } + return lowerVectorIntrinsicSplats(Op, DAG, Subtarget); } +SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = Op.getConstantOperandVal(1); + switch (IntNo) { + default: + break; + case Intrinsic::riscv_masked_strided_store: { + SDLoc DL(Op); + MVT XLenVT = Subtarget.getXLenVT(); + + // If the mask is known to be all ones, optimize to an unmasked intrinsic; + // the selection of the masked intrinsics doesn't do this for us. + SDValue Mask = Op.getOperand(5); + bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode()); + + SDValue Val = Op.getOperand(2); + MVT VT = Val.getSimpleValueType(); + MVT ContainerVT = getContainerForFixedLengthVector(VT); + + Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget); + if (!IsUnmasked) { + MVT MaskVT = + MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + } + + SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + + SDValue IntID = DAG.getTargetConstant( + IsUnmasked ? 
Intrinsic::riscv_vsse : Intrinsic::riscv_vsse_mask, DL, + XLenVT); + + auto *Store = cast(Op); + SmallVector Ops{Store->getChain(), IntID}; + Ops.push_back(Val); + Ops.push_back(Op.getOperand(3)); // Ptr + Ops.push_back(Op.getOperand(4)); // Stride + if (!IsUnmasked) + Ops.push_back(Mask); + Ops.push_back(VL); + + return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Store->getVTList(), + Ops, Store->getMemoryVT(), + Store->getMemOperand()); + } + } + + return SDValue(); +} + static MVT getLMUL1VT(MVT VT) { assert(VT.getVectorElementType().getSizeInBits() <= 64 && "Unexpected vector MVT"); diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -37,6 +37,7 @@ RegisterTargetMachine Y(getTheRISCV64Target()); auto *PR = PassRegistry::getPassRegistry(); initializeGlobalISel(*PR); + initializeRISCVGatherScatterLoweringPass(*PR); initializeRISCVMergeBaseOffsetOptPass(*PR); initializeRISCVExpandPseudoPass(*PR); initializeRISCVInsertVSETVLIPass(*PR); @@ -149,6 +150,9 @@ void RISCVPassConfig::addIRPasses() { addPass(createAtomicExpandPass()); + + addPass(createRISCVGatherScatterLoweringPass()); + TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -78,24 +78,6 @@ TTI::TargetCostKind CostKind, const Instruction *I); - bool isLegalElementTypeForRVV(Type *ScalarTy) const { - if (ScalarTy->isPointerTy()) - return true; - - if (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) || - ScalarTy->isIntegerTy(32) || ScalarTy->isIntegerTy(64)) - return true; - - if (ScalarTy->isHalfTy()) - return ST->hasStdExtZfh(); - if (ScalarTy->isFloatTy()) - return ST->hasStdExtF(); - if (ScalarTy->isDoubleTy()) - return ST->hasStdExtD(); - - return false; - } - bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) { if (!ST->hasStdExtV()) return false; @@ -114,7 +96,7 @@ DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize()) return false; - return isLegalElementTypeForRVV(DataType->getScalarType()); + return TLI->isLegalElementTypeForRVV(DataType->getScalarType()); } bool isLegalMaskedLoad(Type *DataType, Align Alignment) { @@ -142,7 +124,7 @@ DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize()) return false; - return isLegalElementTypeForRVV(DataType->getScalarType()); + return TLI->isLegalElementTypeForRVV(DataType->getScalarType()); } bool isLegalMaskedGather(Type *DataType, Align Alignment) { @@ -169,7 +151,7 @@ return true; Type *Ty = RdxDesc.getRecurrenceType(); - if (!isLegalElementTypeForRVV(Ty)) + if (!TLI->isLegalElementTypeForRVV(Ty)) return false; switch (RdxDesc.getRecurrenceKind()) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-negative.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-negative.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-negative.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=256 | FileCheck %s + +; This contains negative tests for the strided load/store recognition in +; RISCVGatherScatterLowering.cpp + +; Negative test for treating OR as 
ADD. +define void @gather_bad_or(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_bad_or( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[I:%.*]] = mul nuw nsw <32 x i64> [[VEC_IND]], +; CHECK-NEXT: [[OR:%.*]] = or <32 x i64> [[I]], +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], <32 x i64> [[OR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> [[I1]], i32 1, <32 x i1> , <32 x i8> undef) +; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[I3:%.*]] = bitcast i8* [[I2]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[I3]], align 1 +; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[I5:%.*]] = bitcast i8* [[I2]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[I4]], <32 x i8>* [[I5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], +; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %i = mul nuw nsw <32 x i64> %vec.ind, + %or = or <32 x i64> %i, + %i1 = getelementptr inbounds i8, i8* %B, <32 x i64> %or + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %i1, i32 1, <32 x i1> , <32 x i8> undef) + %i2 = getelementptr inbounds i8, i8* %A, i64 %index + %i3 = bitcast i8* %i2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %i3, align 1 + %i4 = add <32 x i8> %wide.load, %wide.masked.gather + %i5 = bitcast i8* %i2 to <32 x i8>* + store <32 x i8> %i4, <32 x i8>* %i5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %i6 = icmp eq i64 %index.next, 1024 + br i1 %i6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; Don't transform since we might not handle wrap correctly with narrow indices. 
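+; (The pass only extracts a stride when the vector index type matches the
+; pointer-sized integer type; the <32 x i32> index below on a 64-bit target
+; is rejected in determineBaseAndStride.)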
+define void @gather_narrow_index(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_narrow_index( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = mul nuw nsw <32 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], <32 x i32> [[TMP0]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> [[TMP1]], i32 1, <32 x i1> , <32 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[TMP4]], <32 x i8>* [[TMP5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i32> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i32> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i32> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> undef) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i32> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; The last element of the start value of the phi has the wrong stride. 
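+; (matchStridedConstant requires every pair of adjacent elements in the
+; start vector to differ by the same amount, so one mismatched element
+; prevents the rewrite.)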
+define void @gather_broken_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_broken_stride( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[I:%.*]] = mul nuw nsw <32 x i64> [[VEC_IND]], +; CHECK-NEXT: [[OR:%.*]] = or <32 x i64> [[I]], +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], <32 x i64> [[OR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> [[I1]], i32 1, <32 x i1> , <32 x i8> undef) +; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[I3:%.*]] = bitcast i8* [[I2]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[I3]], align 1 +; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[I5:%.*]] = bitcast i8* [[I2]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[I4]], <32 x i8>* [[I5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], +; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %i = mul nuw nsw <32 x i64> %vec.ind, + %or = or <32 x i64> %i, + %i1 = getelementptr inbounds i8, i8* %B, <32 x i64> %or + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %i1, i32 1, <32 x i1> , <32 x i8> undef) + %i2 = getelementptr inbounds i8, i8* %A, i64 %index + %i3 = bitcast i8* %i2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %i3, align 1 + %i4 = add <32 x i8> %wide.load, %wide.masked.gather + %i5 = bitcast i8* %i2 to <32 x i8>* + store <32 x i8> %i4, <32 x i8>* %i5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %i6 = icmp eq i64 %index.next, 1024 + br i1 %i6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32 immarg, <32 x i1>, <32 x i8>) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll @@ -0,0 +1,831 @@ +; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=256 | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefix=CHECK-ASM + +%struct.foo = type { i32, i32, i32, i32 } + +; void gather(signed char * __restrict A, signed char * __restrict B) { +; for (int i = 0; i != 1024; ++i) +; A[i] += B[i * 5]; +; } +define void @gather(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] 
= phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP0]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: gather: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: mv a2, zero +; CHECK-ASM-NEXT: addi a6, zero, 32 +; CHECK-ASM-NEXT: addi a4, zero, 5 +; CHECK-ASM-NEXT: addi a5, zero, 1024 +; CHECK-ASM-NEXT: .LBB0_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: vsetvli zero, a6, e8, m1, ta, mu +; CHECK-ASM-NEXT: vlse8.v v25, (a1), a4 +; CHECK-ASM-NEXT: add a3, a0, a2 +; CHECK-ASM-NEXT: vle8.v v26, (a3) +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vse8.v v25, (a3) +; CHECK-ASM-NEXT: addi a2, a2, 32 +; CHECK-ASM-NEXT: addi a1, a1, 160 +; CHECK-ASM-NEXT: bne a2, a5, .LBB0_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> undef) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @gather_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) { +; CHECK-LABEL: @gather_masked( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> [[MASKEDOFF:%.*]], 
i8* [[TMP0]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: gather_masked: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: mv a2, zero +; CHECK-ASM-NEXT: lui a3, 983765 +; CHECK-ASM-NEXT: addiw a3, a3, 873 +; CHECK-ASM-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-ASM-NEXT: vmv.s.x v0, a3 +; CHECK-ASM-NEXT: addi a6, zero, 32 +; CHECK-ASM-NEXT: addi a4, zero, 5 +; CHECK-ASM-NEXT: addi a5, zero, 1024 +; CHECK-ASM-NEXT: .LBB1_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: vsetvli zero, a6, e8, m1, tu, mu +; CHECK-ASM-NEXT: vmv1r.v v25, v8 +; CHECK-ASM-NEXT: vlse8.v v25, (a1), a4, v0.t +; CHECK-ASM-NEXT: add a3, a0, a2 +; CHECK-ASM-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; CHECK-ASM-NEXT: vle8.v v26, (a3) +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vse8.v v25, (a3) +; CHECK-ASM-NEXT: addi a2, a2, 32 +; CHECK-ASM-NEXT: addi a1, a1, 160 +; CHECK-ASM-NEXT: bne a2, a5, .LBB1_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> %maskedoff) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @gather_negative_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_negative_stride( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 155, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP0]], i64 -5, <32 x i1> ) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: 
[[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: gather_negative_stride: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: mv a2, zero +; CHECK-ASM-NEXT: addi a1, a1, 155 +; CHECK-ASM-NEXT: addi a6, zero, 32 +; CHECK-ASM-NEXT: addi a4, zero, -5 +; CHECK-ASM-NEXT: addi a5, zero, 1024 +; CHECK-ASM-NEXT: .LBB2_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: vsetvli zero, a6, e8, m1, ta, mu +; CHECK-ASM-NEXT: vlse8.v v25, (a1), a4 +; CHECK-ASM-NEXT: add a3, a0, a2 +; CHECK-ASM-NEXT: vle8.v v26, (a3) +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vse8.v v25, (a3) +; CHECK-ASM-NEXT: addi a2, a2, 32 +; CHECK-ASM-NEXT: addi a1, a1, 160 +; CHECK-ASM-NEXT: bne a2, a5, .LBB2_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> undef) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @gather_zero_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_zero_stride( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP0]], i64 0, <32 x i1> ) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: gather_zero_stride: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: mv a2, zero +; CHECK-ASM-NEXT: addi a3, zero, 32 +; CHECK-ASM-NEXT: addi a4, zero, 1024 +; CHECK-ASM-NEXT: .LBB3_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; CHECK-ASM-NEXT: vlse8.v v25, (a1), zero +; CHECK-ASM-NEXT: add a5, a0, a2 +; CHECK-ASM-NEXT: vle8.v v26, (a5) +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vse8.v v25, (a5) +; CHECK-ASM-NEXT: addi a2, a2, 32 +; CHECK-ASM-NEXT: addi a1, a1, 160 +; CHECK-ASM-NEXT: bne a2, a4, .LBB3_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> undef) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +;void scatter(signed char * __restrict A, signed char * __restrict B) { +; for (int i = 0; i < 1024; ++i) +; A[i * 5] += B[i]; +;} +define void @scatter(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @scatter( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[A:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[A]], i64 [[VEC_IND_SCALAR1]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP2]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]] +; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v32i8.p0i8.i64(<32 x i8> [[TMP4]], i8* [[TMP3]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; 
CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: scatter: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: mv a2, zero +; CHECK-ASM-NEXT: addi a6, zero, 32 +; CHECK-ASM-NEXT: addi a4, zero, 5 +; CHECK-ASM-NEXT: addi a5, zero, 1024 +; CHECK-ASM-NEXT: .LBB4_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: add a3, a1, a2 +; CHECK-ASM-NEXT: vsetvli zero, a6, e8, m1, ta, mu +; CHECK-ASM-NEXT: vle8.v v25, (a3) +; CHECK-ASM-NEXT: vlse8.v v26, (a0), a4 +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vsse8.v v25, (a0), a4 +; CHECK-ASM-NEXT: addi a2, a2, 32 +; CHECK-ASM-NEXT: addi a0, a0, 160 +; CHECK-ASM-NEXT: bne a2, a5, .LBB4_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %B, i64 %index + %1 = bitcast i8* %0 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %1, align 1 + %2 = mul nuw nsw <32 x i64> %vec.ind, + %3 = getelementptr inbounds i8, i8* %A, <32 x i64> %2 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %3, i32 1, <32 x i1> , <32 x i8> undef) + %4 = add <32 x i8> %wide.masked.gather, %wide.load + call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> %4, <32 x i8*> %3, i32 1, <32 x i1> ) + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %5 = icmp eq i64 %index.next, 1024 + br i1 %5, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @scatter_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) { +; CHECK-LABEL: @scatter_masked( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[A:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[A]], i64 [[VEC_IND_SCALAR1]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> [[MASKEDOFF:%.*]], i8* [[TMP2]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]] +; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v32i8.p0i8.i64(<32 x i8> [[TMP4]], i8* [[TMP3]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+; CHECK-ASM-LABEL: scatter_masked:
+; CHECK-ASM: # %bb.0: # %entry
+; CHECK-ASM-NEXT: mv a2, zero
+; CHECK-ASM-NEXT: addi a6, zero, 32
+; CHECK-ASM-NEXT: lui a4, 983765
+; CHECK-ASM-NEXT: addiw a4, a4, 873
+; CHECK-ASM-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
+; CHECK-ASM-NEXT: vmv.s.x v0, a4
+; CHECK-ASM-NEXT: addi a4, zero, 5
+; CHECK-ASM-NEXT: addi a5, zero, 1024
+; CHECK-ASM-NEXT: .LBB5_1: # %vector.body
+; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT: add a3, a1, a2
+; CHECK-ASM-NEXT: vsetvli zero, a6, e8, m1, ta, mu
+; CHECK-ASM-NEXT: vle8.v v25, (a3)
+; CHECK-ASM-NEXT: vsetvli zero, zero, e8, m1, tu, mu
+; CHECK-ASM-NEXT: vmv1r.v v26, v8
+; CHECK-ASM-NEXT: vlse8.v v26, (a0), a4, v0.t
+; CHECK-ASM-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; CHECK-ASM-NEXT: vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT: vsse8.v v25, (a0), a4, v0.t
+; CHECK-ASM-NEXT: addi a2, a2, 32
+; CHECK-ASM-NEXT: addi a0, a0, 160
+; CHECK-ASM-NEXT: bne a2, a5, .LBB5_1
+; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT: ret
+entry:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = getelementptr inbounds i8, i8* %B, i64 %index
+  %1 = bitcast i8* %0 to <32 x i8>*
+  %wide.load = load <32 x i8>, <32 x i8>* %1, align 1
+  %2 = mul nuw nsw <32 x i64> %vec.ind,
+  %3 = getelementptr inbounds i8, i8* %A, <32 x i64> %2
+  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %3, i32 1, <32 x i1> , <32 x i8> %maskedoff)
+  %4 = add <32 x i8> %wide.masked.gather, %wide.load
+  call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> %4, <32 x i8*> %3, i32 1, <32 x i1> )
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i64> %vec.ind,
+  %5 = icmp eq i64 %index.next, 1024
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+  ret void
+}
+
+; void gather_pow2(signed char * __restrict A, signed char * __restrict B) {
+;   for (int i = 0; i != 1024; ++i)
+;     A[i] += B[i * 4];
+; }
+define void @gather_pow2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK-LABEL: @gather_pow2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8*
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP1]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* [[TMP5]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 32
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+; CHECK-ASM-LABEL: gather_pow2:
+; CHECK-ASM: # %bb.0: # %entry
+; CHECK-ASM-NEXT: addi a2, zero, 1024
+; CHECK-ASM-NEXT: addi a3, zero, 16
+; CHECK-ASM-NEXT: addi a4, zero, 32
+; CHECK-ASM-NEXT: .LBB6_1: # %vector.body
+; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu
+; CHECK-ASM-NEXT: vlse32.v v25, (a1), a3
+; CHECK-ASM-NEXT: vsetvli zero, a4, e8, m1, ta, mu
+; CHECK-ASM-NEXT: vle8.v v26, (a0)
+; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu
+; CHECK-ASM-NEXT: vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT: vsetvli zero, a4, e8, m1, ta, mu
+; CHECK-ASM-NEXT: vse8.v v25, (a0)
+; CHECK-ASM-NEXT: addi a2, a2, -8
+; CHECK-ASM-NEXT: addi a0, a0, 32
+; CHECK-ASM-NEXT: addi a1, a1, 128
+; CHECK-ASM-NEXT: bnez a2, .LBB6_1
+; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT: ret
+entry:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = shl nsw <8 x i64> %vec.ind,
+  %1 = getelementptr inbounds i32, i32* %B, <8 x i64> %0
+  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> , <8 x i32> undef)
+  %2 = getelementptr inbounds i32, i32* %A, i64 %index
+  %3 = bitcast i32* %2 to <8 x i32>*
+  %wide.load = load <8 x i32>, <8 x i32>* %3, align 1
+  %4 = add <8 x i32> %wide.load, %wide.masked.gather
+  %5 = bitcast i32* %2 to <8 x i32>*
+  store <8 x i32> %4, <8 x i32>* %5, align 1
+  %index.next = add nuw i64 %index, 8
+  %vec.ind.next = add <8 x i64> %vec.ind,
+  %6 = icmp eq i64 %index.next, 1024
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+  ret void
+}
+
+;void scatter_pow2(signed char * __restrict A, signed char * __restrict B) {
+;  for (int i = 0; i < 1024; ++i)
+;    A[i * 4] += B[i];
+;}
+define void @scatter_pow2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK-LABEL: @scatter_pow2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to i8*
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR1]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to i8*
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP3]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
+; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP6]], i8* [[TMP5]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 32
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 32
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+; CHECK-ASM-LABEL: scatter_pow2:
+; CHECK-ASM: # %bb.0: # %entry
+; CHECK-ASM-NEXT: addi a2, zero, 1024
+; CHECK-ASM-NEXT: addi a3, zero, 32
+; CHECK-ASM-NEXT: addi a4, zero, 16
+; CHECK-ASM-NEXT: .LBB7_1: # %vector.body
+; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT: vsetvli zero, a3, e8, m1, ta, mu
+; CHECK-ASM-NEXT: vle8.v v25, (a1)
+; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu
+; CHECK-ASM-NEXT: vlse32.v v26, (a0), a4
+; CHECK-ASM-NEXT: vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT: vsse32.v v25, (a0), a4
+; CHECK-ASM-NEXT: addi a2, a2, -8
+; CHECK-ASM-NEXT: addi a1, a1, 32
+; CHECK-ASM-NEXT: addi a0, a0, 128
+; CHECK-ASM-NEXT: bnez a2, .LBB7_1
+; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT: ret
+entry:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %B, i64 %index
+  %1 = bitcast i32* %0 to <8 x i32>*
+  %wide.load = load <8 x i32>, <8 x i32>* %1, align 1
+  %2 = shl nuw nsw <8 x i64> %vec.ind,
+  %3 = getelementptr inbounds i32, i32* %A, <8 x i64> %2
+  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %3, i32 4, <8 x i1> , <8 x i32> undef)
+  %4 = add <8 x i32> %wide.masked.gather, %wide.load
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %4, <8 x i32*> %3, i32 4, <8 x i1> )
+  %index.next = add nuw i64 %index, 8
+  %vec.ind.next = add <8 x i64> %vec.ind,
+  %5 = icmp eq i64 %index.next, 1024
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+  ret void
+}
+
+;struct foo {
+;  int a, b, c, d;
+;};
+;
+;void struct_gather(int * __restrict A, struct foo * __restrict B) {
+;  for (int i = 0; i < 1024; ++i)
+;    A[i] += B[i].b;
+;}
+define void @struct_gather(i32* noalias nocapture %A, %struct.foo* noalias nocapture readonly %B) {
+; CHECK-LABEL: @struct_gather(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 8, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], %struct.foo* [[B:%.*]], i64 [[VEC_IND_SCALAR]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8*
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO]], %struct.foo* [[B]], i64 [[VEC_IND_SCALAR1]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to i8*
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP1]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP3]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i64 8
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw <8 x i32> [[WIDE_LOAD10]], [[WIDE_MASKED_GATHER9]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP11]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 16
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 16
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP12]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+; CHECK-ASM-LABEL: struct_gather:
+; CHECK-ASM: # %bb.0: # %entry
+; CHECK-ASM-NEXT: addi a0, a0, 32
+; CHECK-ASM-NEXT: addi a1, a1, 132
+; CHECK-ASM-NEXT: addi a2, zero, 1024
+; CHECK-ASM-NEXT: addi a3, zero, 16
+; CHECK-ASM-NEXT: .LBB8_1: # %vector.body
+; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT: addi a4, a1, -128
+; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu
+; CHECK-ASM-NEXT: vlse32.v v25, (a4), a3
+; CHECK-ASM-NEXT: vlse32.v v26, (a1), a3
+; CHECK-ASM-NEXT: addi a4, a0, -32
+; CHECK-ASM-NEXT: vle32.v v27, (a4)
+; CHECK-ASM-NEXT: vle32.v v28, (a0)
+; CHECK-ASM-NEXT: vadd.vv v25, v27, v25
+; CHECK-ASM-NEXT: vadd.vv v26, v28, v26
+; CHECK-ASM-NEXT: vse32.v v25, (a4)
+; CHECK-ASM-NEXT: vse32.v v26, (a0)
+; CHECK-ASM-NEXT: addi a2, a2, -16
+; CHECK-ASM-NEXT: addi a0, a0, 64
+; CHECK-ASM-NEXT: addi a1, a1, 256
+; CHECK-ASM-NEXT: bnez a2, .LBB8_1
+; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT: ret
+entry:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ]
+  %step.add = add <8 x i64> %vec.ind,
+  %0 = getelementptr inbounds %struct.foo, %struct.foo* %B, <8 x i64> %vec.ind, i32 1
+  %1 = getelementptr inbounds %struct.foo, %struct.foo* %B, <8 x i64> %step.add, i32 1
+  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %0, i32 4, <8 x i1> , <8 x i32> undef)
+  %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> , <8 x i32> undef)
+  %2 = getelementptr inbounds i32, i32* %A, i64 %index
+  %3 = bitcast i32* %2 to <8 x i32>*
+  %wide.load = load <8 x i32>, <8 x i32>* %3, align 4
+  %4 = getelementptr inbounds i32, i32* %2, i64 8
+  %5 = bitcast i32* %4 to <8 x i32>*
+  %wide.load10 = load <8 x i32>, <8 x i32>* %5, align 4
+  %6 = add nsw <8 x i32> %wide.load, %wide.masked.gather
+  %7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9
+  %8 = bitcast i32* %2 to <8 x i32>*
+  store <8 x i32> %6, <8 x i32>* %8, align 4
+  %9 = bitcast i32* %4 to <8 x i32>*
+  store <8 x i32> %7, <8 x i32>* %9, align 4
+  %index.next = add nuw i64 %index, 16
+  %vec.ind.next = add <8 x i64> %vec.ind,
+  %10 = icmp eq i64 %index.next, 1024
+  br i1 %10, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+  ret void
+}
+
+;void gather_unroll(int * __restrict A, int * __restrict B) {
+;  for (int i = 0; i < 1024; i+= 4 ) {
+;    A[i] += B[i * 4];
+;    A[i+1] += B[(i+1) * 4];
+;    A[i+2] += B[(i+2) * 4];
+;    A[i+3] += B[(i+3) * 4];
+;  }
+;}
+define void @gather_unroll(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK-LABEL: @gather_unroll(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR3:%.*]] = phi i64 [ 4, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR5:%.*]] = phi i64 [ 1, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR7:%.*]] = phi i64 [ 8, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR9:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR11:%.*]] = phi i64 [ 12, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR13:%.*]] = phi i64 [ 3, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR15:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR17:%.*]] = phi i64 [ 1, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR19:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR21:%.*]] = phi i64 [ 3, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8*
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP1]], i64 64, <8 x i1> )
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[VEC_IND_SCALAR1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to i8*
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR15]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to i8*
+; CHECK-NEXT: [[WIDE_MASKED_GATHER52:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP3]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER52]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP6]], i8* [[TMP5]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[B]], i64 [[VEC_IND_SCALAR3]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; CHECK-NEXT: [[WIDE_MASKED_GATHER53:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP8]], i64 64, <8 x i1> )
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR5]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8*
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR17]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to i8*
+; CHECK-NEXT: [[WIDE_MASKED_GATHER54:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP10]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER54]], [[WIDE_MASKED_GATHER53]]
+; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP13]], i8* [[TMP12]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[B]], i64 [[VEC_IND_SCALAR7]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to i8*
+; CHECK-NEXT: [[WIDE_MASKED_GATHER55:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP15]], i64 64, <8 x i1> )
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR9]]
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to i8*
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR19]]
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to i8*
+; CHECK-NEXT: [[WIDE_MASKED_GATHER56:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP17]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER56]], [[WIDE_MASKED_GATHER55]]
+; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP20]], i8* [[TMP19]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[B]], i64 [[VEC_IND_SCALAR11]]
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to i8*
+; CHECK-NEXT: [[WIDE_MASKED_GATHER57:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP22]], i64 64, <8 x i1> )
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR13]]
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to i8*
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR21]]
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to i8*
+; CHECK-NEXT: [[WIDE_MASKED_GATHER58:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP24]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[TMP27:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER58]], [[WIDE_MASKED_GATHER57]]
+; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP27]], i8* [[TMP26]], i64 16, <8 x i1> )
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 128
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 32
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR4]] = add i64 [[VEC_IND_SCALAR3]], 128
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR6]] = add i64 [[VEC_IND_SCALAR5]], 32
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR8]] = add i64 [[VEC_IND_SCALAR7]], 128
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR10]] = add i64 [[VEC_IND_SCALAR9]], 32
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR12]] = add i64 [[VEC_IND_SCALAR11]], 128
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR14]] = add i64 [[VEC_IND_SCALAR13]], 32
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR16]] = add i64 [[VEC_IND_SCALAR15]], 32
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR18]] = add i64 [[VEC_IND_SCALAR17]], 32
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR20]] = add i64 [[VEC_IND_SCALAR19]], 32
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR22]] = add i64 [[VEC_IND_SCALAR21]], 32
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP28]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+; CHECK-ASM-LABEL: gather_unroll:
+; CHECK-ASM: # %bb.0: # %entry
+; CHECK-ASM-NEXT: addi a2, zero, 256
+; CHECK-ASM-NEXT: addi a3, zero, 64
+; CHECK-ASM-NEXT: addi a4, zero, 16
+; CHECK-ASM-NEXT: .LBB9_1: # %vector.body
+; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu
+; CHECK-ASM-NEXT: vlse32.v v25, (a1), a3
+; CHECK-ASM-NEXT: vlse32.v v26, (a0), a4
+; CHECK-ASM-NEXT: vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT: vsse32.v v25, (a0), a4
+; CHECK-ASM-NEXT: addi a5, a1, 16
+; CHECK-ASM-NEXT: vlse32.v v25, (a5), a3
+; CHECK-ASM-NEXT: addi a5, a0, 4
+; CHECK-ASM-NEXT: vlse32.v v26, (a5), a4
+; CHECK-ASM-NEXT: vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT: vsse32.v v25, (a5), a4
+; CHECK-ASM-NEXT: addi a5, a1, 32
+; CHECK-ASM-NEXT: vlse32.v v25, (a5), a3
+; CHECK-ASM-NEXT: addi a5, a0, 8
+; CHECK-ASM-NEXT: vlse32.v v26, (a5), a4
+; CHECK-ASM-NEXT: vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT: vsse32.v v25, (a5), a4
+; CHECK-ASM-NEXT: addi a5, a1, 48
+; CHECK-ASM-NEXT: vlse32.v v25, (a5), a3
+; CHECK-ASM-NEXT: addi a5, a0, 12
+; CHECK-ASM-NEXT: vlse32.v v26, (a5), a4
+; CHECK-ASM-NEXT: vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT: vsse32.v v25, (a5), a4
+; CHECK-ASM-NEXT: addi a2, a2, -8
+; CHECK-ASM-NEXT: addi a1, a1, 512
+; CHECK-ASM-NEXT: addi a0, a0, 128
+; CHECK-ASM-NEXT: bnez a2, .LBB9_1
+; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT: ret
+entry:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = shl nuw nsw <8 x i64> %vec.ind,
+  %1 = getelementptr inbounds i32, i32* %B, <8 x i64> %0
+  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> , <8 x i32> undef)
+  %2 = getelementptr inbounds i32, i32* %A, <8 x i64> %vec.ind
+  %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %2, i32 4, <8 x i1> , <8 x i32> undef)
+  %3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %3, <8 x i32*> %2, i32 4, <8 x i1> )
+  %4 = or <8 x i64> %vec.ind,
+  %5 = shl nsw <8 x i64> %4,
+  %6 = getelementptr inbounds i32, i32* %B, <8 x i64> %5
+  %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %6, i32 4, <8 x i1> , <8 x i32> undef)
+  %7 = getelementptr inbounds i32, i32* %A, <8 x i64> %4
+  %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %7, i32 4, <8 x i1> , <8 x i32> undef)
+  %8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %8, <8 x i32*> %7, i32 4, <8 x i1> )
+  %9 = or <8 x i64> %vec.ind,
+  %10 = shl nsw <8 x i64> %9,
+  %11 = getelementptr inbounds i32, i32* %B, <8 x i64> %10
+  %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %11, i32 4, <8 x i1> , <8 x i32> undef)
+  %12 = getelementptr inbounds i32, i32* %A, <8 x i64> %9
+  %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %12, i32 4, <8 x i1> , <8 x i32> undef)
+  %13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %13, <8 x i32*> %12, i32 4, <8 x i1> )
+  %14 = or <8 x i64> %vec.ind,
+  %15 = shl nsw <8 x i64> %14,
+  %16 = getelementptr inbounds i32, i32* %B, <8 x i64> %15
+  %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %16, i32 4, <8 x i1> , <8 x i32> undef)
+  %17 = getelementptr inbounds i32, i32* %A, <8 x i64> %14
+  %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %17, i32 4, <8 x i1> , <8 x i32> undef)
+  %18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %18, <8 x i32*> %17, i32 4, <8 x i1> )
+  %index.next = add nuw i64 %index, 8
+  %vec.ind.next = add <8 x i64> %vec.ind,
+  %19 = icmp eq i64 %index.next, 256
+  br i1 %19, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+  ret void
+}
+
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32 immarg, <32 x i1>, <32 x i8>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32 immarg, <8 x i1>, <8 x i32>)
+declare void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8>, <32 x i8*>, i32 immarg, <32 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32 immarg, <8 x i1>)
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -507,9 +507,10 @@
     return false;
 
   std::vector<StringRef> PassNamePrefix = {
-      "x86-",  "xcore-", "wasm-",    "systemz-", "ppc-",    "nvvm-",   "nvptx-",
-      "mips-", "lanai-", "hexagon-", "bpf-",     "avr-",    "thumb2-", "arm-",
-      "si-",   "gcn-",   "amdgpu-",  "aarch64-", "amdgcn-", "polly-"};
+      "x86-",    "xcore-", "wasm-",    "systemz-", "ppc-",    "nvvm-",
+      "nvptx-",  "mips-",  "lanai-",   "hexagon-", "bpf-",    "avr-",
+      "thumb2-", "arm-",   "si-",      "gcn-",     "amdgpu-", "aarch64-",
+      "amdgcn-", "polly-", "riscv-"};
   std::vector<StringRef> PassNameContain = {"ehprepare"};
   std::vector<StringRef> PassNameExact = {
       "safe-stack", "cost-model",