Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -743,6 +743,8 @@
   bool isLegalMaskedStore(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked load.
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) const;
+  /// Return true if the target supports masked prefetch.
+  bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) const;
   /// Return true if the target supports nontemporal store.
   bool isLegalNTStore(Type *DataType, Align Alignment) const;
@@ -757,6 +759,8 @@
   bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked gather.
   bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
+  /// Return true if the target supports masked gather prefetch.
+  bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) const;
   /// Return true if the target forces scalarizing of llvm.masked.gather
   /// intrinsics.
   bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const;
@@ -1769,12 +1773,14 @@
       getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const = 0;
   virtual bool isLegalMaskedStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0;
+  virtual bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalBroadcastLoad(Type *ElementTy,
                                     ElementCount NumElements) const = 0;
   virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
+  virtual bool isLegalMaskedGatherPrefetch(Type *DataType,
+                                           Align Alignment) = 0;
   virtual bool forceScalarizeMaskedGather(VectorType *DataType,
                                           Align Alignment) = 0;
   virtual bool forceScalarizeMaskedScatter(VectorType *DataType,
@@ -2225,6 +2231,9 @@
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedLoad(DataType, Alignment);
   }
+  bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) override {
+    return Impl.isLegalMaskedPrefetch(DataType, Alignment);
+  }
   bool isLegalNTStore(Type *DataType, Align Alignment) override {
     return Impl.isLegalNTStore(DataType, Alignment);
   }
@@ -2241,6 +2250,9 @@
   bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedGather(DataType, Alignment);
   }
+  bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) override {
+    return Impl.isLegalMaskedGatherPrefetch(DataType, Alignment);
+  }
   bool forceScalarizeMaskedGather(VectorType *DataType,
                                   Align Alignment) override {
     return Impl.forceScalarizeMaskedGather(DataType, Alignment);
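Note (reviewer illustration, not part of the patch): a transform that wants to emit the new intrinsics
would query these hooks through the usual TargetTransformInfo interface. A minimal sketch, with the
vector type and alignment chosen arbitrarily:

    // Sketch only: both hooks default to false (see TargetTransformInfoImpl.h
    // below), so this returns true only for targets that opt in.
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    static bool canUseMaskedPrefetch(const TargetTransformInfo &TTI,
                                     LLVMContext &Ctx) {
      auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
      return TTI.isLegalMaskedPrefetch(VecTy, Align(4)) ||
             TTI.isLegalMaskedGatherPrefetch(VecTy, Align(4));
    }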
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -254,6 +254,10 @@
     return false;
   }
 
+  bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
   bool isLegalNTStore(Type *DataType, Align Alignment) const {
     // By default, assume nontemporal memory stores are available for stores
     // that are aligned and have a size that is a power of 2.
@@ -280,6 +284,10 @@
     return false;
   }
 
+  bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
   bool forceScalarizeMaskedGather(VectorType *DataType, Align Alignment) const {
     return false;
   }
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1556,6 +1556,15 @@
     return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
                                            VarMask, Alignment, CostKind, I);
   }
+  case Intrinsic::masked_gather_prefetch: {
+    const Value *Mask = Args[4];
+    bool VarMask = !isa<Constant>(Mask);
+    Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
+    auto *MaskVT = cast<VectorType>(Mask->getType());
+    auto *PseudoDataTy = MaskVT->getWithNewBitWidth(Alignment.value() * 8);
+    return thisT()->getGatherScatterOpCost(Instruction::Call, PseudoDataTy,
+                                           Args[0], VarMask, Alignment,
+                                           CostKind, I);
+  }
   case Intrinsic::experimental_stepvector: {
     if (isa<ScalableVectorType>(RetTy))
       return BaseT::getIntrinsicInstrCost(ICA, CostKind);
@@ -1870,6 +1879,13 @@
     return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
                                           CostKind);
   }
+  case Intrinsic::masked_prefetch: {
+    auto *MaskVT = cast<VectorType>(ICA.getArgTypes()[4]);
+    Type *PseudoTy = MaskVT->getWithNewBitWidth(32);
+    Align TyAlign = thisT()->DL.getABITypeAlign(PseudoTy);
+    return thisT()->getMaskedMemoryOpCost(Instruction::Call, PseudoTy, TyAlign,
+                                          0, CostKind);
+  }
   case Intrinsic::vector_reduce_add:
     return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
                                                std::nullopt, CostKind);
Index: llvm/include/llvm/IR/IRBuilder.h
===================================================================
--- llvm/include/llvm/IR/IRBuilder.h
+++ llvm/include/llvm/IR/IRBuilder.h
@@ -792,6 +792,11 @@
   CallInst *CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment,
                               Value *Mask);
 
+  /// Create a call to Masked Prefetch intrinsic
+  CallInst *CreateMaskedPrefetch(Type *Ty, Value *Ptr, Align Alignment,
+                                 Value *Mask, Value *RW = nullptr,
+                                 Value *Locality = nullptr,
+                                 const Twine &Name = "");
+
   /// Create a call to Masked Gather intrinsic
   CallInst *CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment,
                                Value *Mask = nullptr, Value *PassThru = nullptr,
@@ -801,6 +806,12 @@
   CallInst *CreateMaskedScatter(Value *Val, Value *Ptrs, Align Alignment,
                                 Value *Mask = nullptr);
 
+  /// Create a call to Masked Gather Prefetch intrinsic
+  CallInst *CreateMaskedGatherPrefetch(Type *Ty, Value *Ptrs, Align Alignment,
+                                       Value *Mask = nullptr,
+                                       Value *RW = nullptr,
+                                       Value *Locality = nullptr,
+                                       const Twine &Name = "");
+
   /// Create a call to Masked Expand Load intrinsic
   CallInst *CreateMaskedExpandLoad(Type *Ty, Value *Ptr, Value *Mask = nullptr,
                                    Value *PassThru = nullptr,
Index: llvm/include/llvm/IR/IntrinsicInst.h
===================================================================
--- llvm/include/llvm/IR/IntrinsicInst.h
+++ llvm/include/llvm/IR/IntrinsicInst.h
@@ -1328,6 +1328,106 @@
   }
 };
 
+/// This class represents a prefetch intrinsic,
+/// i.e. llvm.prefetch
+class PrefetchInst : public IntrinsicInst {
+public:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::prefetch;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+
+  Value *getPointerOperand() { return getOperand(0); }
+  const Value *getPointerOperand() const { return getOperand(0); }
+  static unsigned getPointerOperandIndex() { return 0U; }
+  Type *getPointerOperandType() const {
+    return getPointerOperand()->getType();
+  }
+};
+
+/// A helper function that returns the pointer operand of a prefetch
+/// intrinsic. Returns nullptr if \p V is not a prefetch.
+inline const Value *getPrefetchPointerOperand(const Value *V) {
+  if (auto *Prefetch = dyn_cast<PrefetchInst>(V))
+    return Prefetch->getPointerOperand();
+  return nullptr;
+}
+inline Value *getPrefetchPointerOperand(Value *V) {
+  return const_cast<Value *>(
+      getPrefetchPointerOperand(static_cast<const Value *>(V)));
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// a prefetch intrinsic.
+inline unsigned getPrefetchAddressSpace(Value *I) {
+  assert(isa<PrefetchInst>(I) && "Expected prefetch instruction");
+  auto *PtrTy = cast<PrefetchInst>(I)->getPointerOperandType();
+  return cast<PointerType>(PtrTy)->getAddressSpace();
+}
+
+/// A helper function that returns the prefetched element type of a prefetch
+/// intrinsic, or nullptr if it cannot be determined.
+inline Type *getPrefetchType(Value *I) {
+  assert(isa<PrefetchInst>(I) && "Expected prefetch instruction");
+  auto *Prefetch = cast<PrefetchInst>(I);
+  if (auto *GEP =
+          dyn_cast<GetElementPtrInst>(Prefetch->getPointerOperand())) {
+    Type *ElemTy = GEP->getSourceElementType();
+    auto *ArrTy = dyn_cast<ArrayType>(ElemTy);
+    while (ArrTy) {
+      ElemTy = ArrTy->getArrayElementType();
+      ArrTy = dyn_cast<ArrayType>(ElemTy);
+    }
+    return isa<PointerType>(ElemTy) ? Type::getInt64Ty(I->getContext())
+                                    : ElemTy;
+  }
+  if (auto *Alloca = dyn_cast<AllocaInst>(Prefetch->getPointerOperand())) {
+    Type *ElemTy = Alloca->getAllocatedType()->getArrayElementType();
+    return isa<PointerType>(ElemTy) ? Type::getInt64Ty(I->getContext())
+                                    : ElemTy;
+  }
+  return nullptr;
+}
+
+/// A helper function that returns the alignment of a prefetch intrinsic.
+inline Align getPrefetchAlignment(Value *I) {
+  assert(isa<PrefetchInst>(I) && "Expected prefetch instruction");
+  auto *Ty = getPrefetchType(I);
+  return Ty ? Align(Ty->getScalarSizeInBits() >> 3) : Align(1ULL);
+}
+
+/// A helper function that returns the alignment of a load/store/prefetch
+/// instruction.
+inline Align getLdStPfAlignment(Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchAlignment(I);
+  return getLoadStoreAlignment(I);
+}
+
+/// A helper function that returns the pointer operand of a load/store/prefetch
+/// instruction. Returns nullptr if \p I is not one of these.
+inline const Value *getLdStPfPointerOperand(const Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchPointerOperand(I);
+  return getLoadStorePointerOperand(I);
+}
+inline Value *getLdStPfPointerOperand(Value *V) {
+  return const_cast<Value *>(
+      getLdStPfPointerOperand(static_cast<const Value *>(V)));
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// a load/store/prefetch instruction.
+inline unsigned getLdStPfAddressSpace(Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchAddressSpace(I);
+  return getLoadStoreAddressSpace(I);
+}
+
+/// A helper function that returns the accessed type of a load/store/prefetch
+/// instruction.
+inline Type *getLdStPfType(Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchType(I);
+  return getLoadStoreType(I);
+}
+
 /// This class represents any memmove intrinsic
 /// i.e. llvm.element.unordered.atomic.memmove
 ///  and llvm.memmove
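Note (reviewer illustration, not part of the patch): the getLdStPf* helpers are meant to let client
code treat llvm.prefetch uniformly with loads and stores, which is how LoopVectorize.cpp uses them
below. A short sketch; the function name is hypothetical:

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/IntrinsicInst.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Sketch only: classify any load, store or prefetch the same way.
    static void describeMemAccess(Instruction &I) {
      Value *Ptr = getLdStPfPointerOperand(&I);
      if (!Ptr) // not a load, store or llvm.prefetch call
        return;
      Type *AccessTy = getLdStPfType(&I); // may be null for a prefetch whose
                                          // pointee type cannot be guessed
      Align A = getLdStPfAlignment(&I);   // falls back to Align(1) in that case
      errs() << I << "  addrspace(" << getLdStPfAddressSpace(&I) << ") align "
             << A.value() << (AccessTy ? "" : " (unknown type)") << "\n";
    }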
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -2208,6 +2208,21 @@
               [IntrWriteMem, IntrArgMemOnly, IntrWillReturn,
                NoCapture<ArgIndex<1>>]>;
 
+def int_masked_prefetch:
+    DefaultAttrsIntrinsic<[],
+              [llvm_anyptr_ty,
+               llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty],
+              [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn,
+               ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+
+def int_masked_gather_prefetch:
+    DefaultAttrsIntrinsic<[],
+              [llvm_anyvector_ty,
+               llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+              [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn,
+               ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+
 // Test whether a pointer is associated with a type metadata identifier.
 def int_type_test : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty],
                                           [IntrNoMem, IntrWillReturn, IntrSpeculatable]>;
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -433,6 +433,11 @@
   return TTIImpl->isLegalMaskedLoad(DataType, Alignment);
 }
 
+bool TargetTransformInfo::isLegalMaskedPrefetch(Type *DataType,
+                                                Align Alignment) const {
+  return TTIImpl->isLegalMaskedPrefetch(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalNTStore(Type *DataType,
                                          Align Alignment) const {
   return TTIImpl->isLegalNTStore(DataType, Alignment);
@@ -481,6 +486,11 @@
   return TTIImpl->isLegalMaskedExpandLoad(DataType);
 }
 
+bool TargetTransformInfo::isLegalMaskedGatherPrefetch(Type *DataType,
+                                                      Align Alignment) const {
+  return TTIImpl->isLegalMaskedGatherPrefetch(DataType, Alignment);
+}
+
 bool TargetTransformInfo::enableOrderedReductions() const {
   return TTIImpl->enableOrderedReductions();
 }
Index: llvm/lib/Analysis/VectorUtils.cpp
===================================================================
--- llvm/lib/Analysis/VectorUtils.cpp
+++ llvm/lib/Analysis/VectorUtils.cpp
@@ -92,6 +92,7 @@
   case Intrinsic::canonicalize:
   case Intrinsic::fptosi_sat:
   case Intrinsic::fptoui_sat:
+  case Intrinsic::prefetch:
     return true;
   default:
     return false;
Index: llvm/lib/IR/IRBuilder.cpp
===================================================================
--- llvm/lib/IR/IRBuilder.cpp
+++ llvm/lib/IR/IRBuilder.cpp
@@ -602,6 +602,30 @@
   return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, OverloadedTypes);
 }
 
+/// Create a call to a Masked Prefetch intrinsic.
+/// \p Ty        - vector type to prefetch
+/// \p Ptr       - base pointer for the prefetch
+/// \p Alignment - alignment of the prefetched location
+/// \p Mask      - vector of booleans which indicates what vector lanes should
+///                be accessed in memory
+/// \p RW        - Read or Write
+/// \p Locality  - Cache Level
+/// \p Name      - name of the result variable
+CallInst *IRBuilderBase::CreateMaskedPrefetch(Type *Ty, Value *Ptr,
+                                              Align Alignment, Value *Mask,
+                                              Value *RW, Value *Locality,
+                                              const Twine &Name) {
+  auto *PtrTy = cast<PointerType>(Ptr->getType());
+  assert(Ty->isVectorTy() && "Type should be vector");
+  assert(PtrTy->isOpaqueOrPointeeTypeMatches(Ty) && "Wrong element type");
+  assert(Mask && "Mask should not be all-ones (null)");
+  Type *OverloadedTypes[] = {PtrTy, Mask->getType()};
+  Value *Ops[] = {Ptr, getInt32(Alignment.value()), RW, Locality, Mask};
+  return CreateMaskedIntrinsic(Intrinsic::masked_prefetch, Ops,
+                               OverloadedTypes, Name);
+}
+
 /// Create a call to a Masked intrinsic, with given intrinsic Id,
 /// an array of operands - Ops, and an array of overloaded types -
 /// OverloadedTypes.
@@ -708,6 +732,41 @@
                                OverloadedTypes);
 }
 
+/// Create a call to a Masked Gather Prefetch intrinsic.
+/// \p Ty        - vector type to gather
+/// \p Ptrs      - vector of pointers for prefetching
+/// \p Alignment - alignment of the prefetched locations
+/// \p Mask      - vector of booleans which indicates what vector lanes should
+///                be accessed in memory
+/// \p RW        - Read or Write
+/// \p Locality  - Cache Level
+/// \p Name      - name of the result variable
+CallInst *IRBuilderBase::CreateMaskedGatherPrefetch(Type *Ty, Value *Ptrs,
+                                                    Align Alignment,
+                                                    Value *Mask, Value *RW,
+                                                    Value *Locality,
+                                                    const Twine &Name) {
+  auto *VecTy = cast<VectorType>(Ty);
+  ElementCount NumElts = VecTy->getElementCount();
+  auto *PtrsTy = cast<VectorType>(Ptrs->getType());
+  assert(cast<PointerType>(PtrsTy->getElementType())
+             ->isOpaqueOrPointeeTypeMatches(
+                 cast<VectorType>(Ty)->getElementType()) &&
+         "Element type mismatch");
+  assert(NumElts == PtrsTy->getElementCount() && "Element count mismatch");
+
+  if (!Mask)
+    Mask = Constant::getAllOnesValue(
+        VectorType::get(Type::getInt1Ty(Context), NumElts));
+
+  Type *OverloadedTypes[] = {PtrsTy};
+  Value *Ops[] = {Ptrs, getInt32(Alignment.value()), RW, Locality, Mask};
+
+  // We specify only one type when we create this intrinsic. Types of other
+  // arguments are derived from this type.
+  return CreateMaskedIntrinsic(Intrinsic::masked_gather_prefetch, Ops,
+                               OverloadedTypes, Name);
+}
+
 template <typename T0, typename T1, typename T2, typename T3>
 static std::vector<Value *>
 getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes,
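Note (reviewer illustration, not part of the patch): typical use of the two new builder entry points.
The values fed in (insertion point, pointers, mask) are assumed to exist; RW/Locality use the same
encoding as the corresponding llvm.prefetch operands (0/1 and 0-3):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Sketch only: emit one contiguous masked prefetch and one gather
    // prefetch. `B` is positioned in a function; `Ptr` is a scalar pointer,
    // `PtrVec` a <4 x ptr>, `Mask4` a <4 x i1>.
    static void emitPrefetches(IRBuilderBase &B, Value *Ptr, Value *PtrVec,
                               Value *Mask4) {
      auto *DataTy = FixedVectorType::get(B.getFloatTy(), 4);
      Value *RW = B.getInt32(0);       // 0 = prefetch for read
      Value *Locality = B.getInt32(3); // 3 = high temporal locality
      B.CreateMaskedPrefetch(DataTy, Ptr, Align(4), Mask4, RW, Locality);
      // Passing a null mask means "all lanes" for the gather form.
      B.CreateMaskedGatherPrefetch(DataTy, PtrVec, Align(4), /*Mask=*/nullptr,
                                   RW, Locality);
    }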
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -600,7 +600,7 @@
 bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                                ElementCount VF) const {
-  Value *Ptr = getLoadStorePointerOperand(&I);
+  Value *Ptr = getLdStPfPointerOperand(&I);
   if (!Ptr)
     return false;
   // Note: There's nothing inherent which prevents predicated loads and
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1470,19 +1470,28 @@
            TTI.isLegalMaskedLoad(DataType, Alignment);
   }
 
+  /// Returns true if the target machine supports masked prefetch operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+  bool isLegalMaskedPrefetch(Type *DataType, Value *Ptr,
+                             Align Alignment) const {
+    return Legal->isConsecutivePtr(DataType, Ptr) &&
+           TTI.isLegalMaskedPrefetch(DataType, Alignment);
+  }
+
   /// Returns true if the target machine can represent \p V as a masked gather
   /// or scatter operation.
   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
     bool LI = isa<LoadInst>(V);
     bool SI = isa<StoreInst>(V);
-    if (!LI && !SI)
+    bool PF = isa<PrefetchInst>(V);
+    if (!LI && !SI && !PF)
       return false;
-    auto *Ty = getLoadStoreType(V);
-    Align Align = getLoadStoreAlignment(V);
+    auto *Ty = getLdStPfType(V);
+    Align Align = getLdStPfAlignment(V);
     if (VF.isVector())
       Ty = VectorType::get(Ty, VF);
     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
-           (SI && TTI.isLegalMaskedScatter(Ty, Align));
+           (SI && TTI.isLegalMaskedScatter(Ty, Align)) ||
+           (PF && TTI.isLegalMaskedPrefetch(Ty, Align));
   }
 
   /// Returns true if the target machine supports all of the reduction
@@ -4401,8 +4410,18 @@
   switch(I->getOpcode()) {
   default:
     return true;
-  case Instruction::Call:
-    return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+  case Instruction::Call: {
+    if (!isa<PrefetchInst>(I))
+      return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+    auto *Ptr = getPrefetchPointerOperand(I);
+    auto *Ty = getPrefetchType(I);
+    Type *VTy = Ty;
+    if (VF.isVector())
+      VTy = VectorType::get(Ty, VF);
+    const Align Alignment = getPrefetchAlignment(I);
+    return !(isLegalMaskedPrefetch(Ty, Ptr, Alignment) ||
+             TTI.isLegalMaskedGatherPrefetch(VTy, Alignment));
+  }
   case Instruction::Load:
   case Instruction::Store: {
     auto *Ptr = getLoadStorePointerOperand(I);
@@ -4609,10 +4628,10 @@
 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
     Instruction *I, ElementCount VF) {
   // Get and ensure we have a valid memory instruction.
-  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+  assert((isa<LoadInst, StoreInst, PrefetchInst>(I)) &&
+         "Invalid memory instruction");
 
-  auto *Ptr = getLoadStorePointerOperand(I);
-  auto *ScalarTy = getLoadStoreType(I);
+  auto *Ptr = getLdStPfPointerOperand(I);
+  auto *ScalarTy = getLdStPfType(I);
 
   // In order to be widened, the pointer should be consecutive, first of all.
   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
@@ -6469,11 +6488,11 @@
   if (VF.isScalable())
     return InstructionCost::getInvalid();
 
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto SE = PSE.getSE();
 
-  unsigned AS = getLoadStoreAddressSpace(I);
-  Value *Ptr = getLoadStorePointerOperand(I);
+  unsigned AS = getLdStPfAddressSpace(I);
+  Value *Ptr = getLdStPfPointerOperand(I);
   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
   //       that it is being called from this specific place.
@@ -6489,7 +6508,7 @@
   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  const Align Alignment = getLoadStoreAlignment(I);
+  const Align Alignment = getLdStPfAlignment(I);
   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
                                                       ValTy->getScalarType(),
                                                       Alignment, AS, CostKind);
@@ -6524,16 +6543,16 @@
 InstructionCost
 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                     ElementCount VF) {
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
-  Value *Ptr = getLoadStorePointerOperand(I);
-  unsigned AS = getLoadStoreAddressSpace(I);
+  Value *Ptr = getLdStPfPointerOperand(I);
+  unsigned AS = getLdStPfAddressSpace(I);
   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
          "Stride should be 1 or -1 for consecutive memory access");
-  const Align Alignment = getLoadStoreAlignment(I);
+  const Align Alignment = getLdStPfAlignment(I);
   InstructionCost Cost = 0;
   if (Legal->isMaskRequired(I)) {
     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
@@ -6556,11 +6575,16 @@
                                                  ElementCount VF) {
   assert(Legal->isUniformMemOp(*I, VF));
 
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
-  const Align Alignment = getLoadStoreAlignment(I);
-  unsigned AS = getLoadStoreAddressSpace(I);
+  const Align Alignment = getLdStPfAlignment(I);
+  unsigned AS = getLdStPfAddressSpace(I);
   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  if (isa<PrefetchInst>(I)) {
+    return TTI.getAddressComputationCost(ValTy) +
+           TTI.getMemoryOpCost(Instruction::Call, ValTy, Alignment, AS,
+                               CostKind);
+  }
   if (isa<LoadInst>(I)) {
     return TTI.getAddressComputationCost(ValTy) +
            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
@@ -6582,10 +6606,10 @@
 InstructionCost
 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                  ElementCount VF) {
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
-  const Align Alignment = getLoadStoreAlignment(I);
-  const Value *Ptr = getLoadStorePointerOperand(I);
+  const Align Alignment = getLdStPfAlignment(I);
+  const Value *Ptr = getLdStPfPointerOperand(I);
 
   return TTI.getAddressComputationCost(VectorTy) +
          TTI.getGatherScatterOpCost(
@@ -6811,9 +6835,9 @@
     // Calculate scalar cost only. Vectorization cost should be ready at this
     // moment.
     if (VF.isScalar()) {
-      Type *ValTy = getLoadStoreType(I);
-      const Align Alignment = getLoadStoreAlignment(I);
-      unsigned AS = getLoadStoreAddressSpace(I);
+      Type *ValTy = getLdStPfType(I);
+      const Align Alignment = getLdStPfAlignment(I);
+      unsigned AS = getLdStPfAddressSpace(I);
 
       TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
       return TTI.getAddressComputationCost(ValTy) +
@@ -6914,7 +6938,7 @@
   for (BasicBlock *BB : TheLoop->blocks()) {
     // For each instruction in the old loop.
     for (Instruction &I : *BB) {
-      Value *Ptr = getLoadStorePointerOperand(&I);
+      Value *Ptr = getLdStPfPointerOperand(&I);
       if (!Ptr)
         continue;
@@ -6974,7 +6998,7 @@
       if (memoryInstructionCanBeWidened(&I, VF)) {
         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
         int ConsecutiveStride = Legal->isConsecutivePtr(
-            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
+            getLdStPfType(&I), getLdStPfPointerOperand(&I));
         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
                "Expected consecutive stride.");
         InstWidening Decision =
@@ -7408,6 +7432,20 @@
     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
+    if (isa<PrefetchInst>(I)) {
+      ElementCount Width = VF;
+      if (Width.isVector()) {
+        InstWidening Decision = getWideningDecision(I, Width);
+        assert(Decision != CM_Unknown &&
+               "CM decision should be taken at this point");
+        if (getWideningCost(I, VF) == InstructionCost::getInvalid())
+          return InstructionCost::getInvalid();
+        if (Decision == CM_Scalarize)
+          Width = ElementCount::getFixed(1);
+      }
+      VectorTy = ToVectorTy(getLdStPfType(I), Width);
+      return getMemoryInstructionCost(I, VF);
+    }
     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
         return *RedCost;
@@ -8240,7 +8278,7 @@
                                                 ArrayRef<VPValue *> Operands,
                                                 VFRange &Range,
                                                 VPlanPtr &Plan) {
-  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+  assert((isa<LoadInst>(I) || isa<StoreInst>(I) || isa<PrefetchInst>(I)) &&
         "Must be called with either a load, store or prefetch");
 
   auto willWiden = [&](ElementCount VF) -> bool {
@@ -8275,6 +8313,10 @@
     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
                                               Consecutive, Reverse);
 
+  if (PrefetchInst *Prefetch = dyn_cast<PrefetchInst>(I))
+    return new VPWidenMemoryInstructionRecipe(*Prefetch, Operands[0], Mask,
+                                              Consecutive, Reverse);
+
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
                                             Mask, Consecutive, Reverse);
@@ -8694,10 +8736,12 @@
           [&](ElementCount VF) { return VF.isScalar(); }, Range))
     return nullptr;
 
-  if (auto *CI = dyn_cast<CallInst>(Instr))
+  if (isa<CallInst>(Instr) && !isa<PrefetchInst>(Instr)) {
+    auto *CI = cast<CallInst>(Instr);
     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
+  }
 
-  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr) ||
+      isa<PrefetchInst>(Instr))
     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
 
   if (!shouldWiden(Instr, Range))
@@ -9626,7 +9670,7 @@
   if (IsUniform) {
     // If the recipe is uniform across all parts (instead of just per VF), only
    // generate a single instance.
-    if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
+    if ((isa<LoadInst>(UI) || isa<StoreInst>(UI) || isa<PrefetchInst>(UI)) &&
         all_of(operands(), [](VPValue *Op) {
           return Op->isDefinedOutsideVectorRegions();
         })) {
@@ -9656,6 +9700,16 @@
     return;
   }
 
+  // A prefetch to a uniform address only needs the last copy of the prefetch.
+  if (isa<PrefetchInst>(UI) &&
+      vputils::isUniformAfterVectorization(getOperand(0))) {
+    auto Lane = VPLane::getLastLaneForVF(State.VF);
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
+                                    State);
+    return;
+  }
+
   // Generate scalar instances for all VF lanes of all UF parts.
   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
   const unsigned EndLane = State.VF.getKnownMinValue();
@@ -9670,15 +9724,17 @@
   // Attempt to issue a wide load.
   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
+  PrefetchInst *PF = dyn_cast<PrefetchInst>(&Ingredient);
 
-  assert((LI || SI) && "Invalid Load/Store instruction");
+  assert((LI || SI || PF) && "Invalid Load/Store/Prefetch instruction");
   assert((!SI || StoredValue) && "No stored value provided for widened store");
   assert((!LI || !StoredValue) && "Stored value provided for widened load");
+  assert((!PF || !StoredValue) &&
+         "Stored value provided for widened prefetch");
 
-  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  Type *ScalarDataTy = getLdStPfType(&Ingredient);
 
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+  const Align Alignment = getLdStPfAlignment(&Ingredient);
   bool CreateGatherScatter = !isConsecutive();
 
   auto &Builder = State.Builder;
@@ -9762,6 +9818,37 @@
     return;
   }
 
+  if (PF) {
+    State.setDebugLocFromInst(PF);
+    Value *RW = PF->getArgOperand(1);
+    Value *Locality = PF->getArgOperand(2);
+
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Instruction *NewPF = nullptr;
+      if (CreateGatherScatter) {
+        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        Value *VectorGep = State.get(getAddr(), Part);
+        NewPF = Builder.CreateMaskedGatherPrefetch(DataTy, VectorGep,
+                                                   Alignment, MaskPart, RW,
+                                                   Locality);
+      } else {
+        auto *VecPtr =
+            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+        if (isMaskRequired) {
+          NewPF = Builder.CreateMaskedPrefetch(DataTy, VecPtr, Alignment,
+                                               BlockInMaskParts[Part], RW,
+                                               Locality);
+        } else {
+          auto *MaskPart = Constant::getAllOnesValue(
+              VectorType::get(Type::getInt1Ty(DataTy->getContext()), DataTy));
+          NewPF = Builder.CreateMaskedPrefetch(DataTy, VecPtr, Alignment,
+                                               MaskPart, RW, Locality);
+        }
+      }
+      State.addMetadata(NewPF, PF);
+    }
+    return;
+  }
+
   // Handle loads.
   assert(LI && "Must have a load instruction");
   State.setDebugLocFromInst(LI);
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1901,7 +1901,8 @@
   }
 
   bool isMasked() const {
-    return isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
+    return isPrefetch() ? getNumOperands() == 5
+                        : isStore() ? getNumOperands() == 3
+                                    : getNumOperands() == 2;
   }
 
 public:
@@ -1923,6 +1924,14 @@
     setMask(Mask);
   }
 
+  VPWidenMemoryInstructionRecipe(PrefetchInst &Prefetch, VPValue *Addr,
+                                 VPValue *Mask, bool Consecutive, bool Reverse)
+      : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}),
+        Ingredient(Prefetch), Consecutive(Consecutive), Reverse(Reverse) {
+    assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+    setMask(Mask);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC)
 
   /// Return the address accessed by this recipe.
@@ -1940,6 +1949,9 @@
   /// Returns true if this recipe is a store.
   bool isStore() const { return isa<StoreInst>(Ingredient); }
 
+  /// Returns true if this recipe is a prefetch.
+  bool isPrefetch() const { return isa<PrefetchInst>(Ingredient); }
+
   /// Return the address accessed by this recipe.
   VPValue *getStoredValue() const {
     assert(isStore() && "Stored value only available for store instructions");
Index: llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -135,13 +135,17 @@
            "underlying instruction has side-effects");
     return false;
   }
-  case VPWidenMemoryInstructionSC:
+  case VPWidenMemoryInstructionSC: {
+    auto *R = cast<VPWidenMemoryInstructionRecipe>(this);
+    if (isa<PrefetchInst>(R->getIngredient()))
+      return true;
     assert(cast<VPWidenMemoryInstructionRecipe>(this)
                ->getIngredient()
                .mayHaveSideEffects() == mayWriteToMemory() &&
           "mayHaveSideffects result for ingredient differs from this "
           "implementation");
     return mayWriteToMemory();
+  }
   case VPReplicateSC: {
     auto *R = cast<VPReplicateRecipe>(this);
     return R->getUnderlyingInstr()->mayHaveSideEffects();
@@ -1081,7 +1085,7 @@
                                            VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN ";
 
-  if (!isStore()) {
+  if (!isStore() && !isPrefetch()) {
     getVPSingleValue()->printAsOperand(O, SlotTracker);
     O << " = ";
   }
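Note (reviewer illustration, not part of the patch): since both TTI hooks default to false, existing
targets are unaffected; a target with hardware masked/gather prefetches would opt in from its own TTI
implementation. A sketch with a made-up target name and a placeholder policy:

    // Hypothetical <Target>TTIImpl members; the real conditions would check
    // subtarget features and the supported element types/alignments.
    bool MyTargetTTIImpl::isLegalMaskedPrefetch(Type *DataType,
                                                Align Alignment) const {
      return isa<FixedVectorType>(DataType); // placeholder policy
    }

    bool MyTargetTTIImpl::isLegalMaskedGatherPrefetch(Type *DataType,
                                                      Align Alignment) const {
      return isLegalMaskedPrefetch(DataType, Alignment);
    }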