Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -129,6 +129,19 @@
                                   MaybeAlign Alignment, unsigned AddressSpace,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I = nullptr);
+  InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+                                         const Value *Ptr, bool VariableMask,
+                                         Align Alignment,
+                                         TTI::TargetCostKind CostKind,
+                                         const Instruction *I);
+  bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment);
+  bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
+    return forceScalarizeMaskedGather(VTy, Alignment); // Same as gathers.
+  }
+  bool isLegalMaskedGather(Type *DataType, Align Alignment);
+  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
+    return isLegalMaskedGather(DataType, Alignment); // Same rule as gathers.
+  }
   InstructionCost getInterleavedMemoryOpCost(
       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -24,6 +24,12 @@
 using namespace llvm;
 
 #define DEBUG_TYPE "ppctti"
+// Hidden, off-by-default switch gating PowerPC masked gather/scatter support.
+static cl::opt<bool> EnableLegalMaskedGather("ppc-enable-gather", cl::Hidden,
+    cl::init(false), cl::desc("Enable masked gather"));
+// Debug override: when given, replaces the computed gather/scatter cost.
+static cl::opt<unsigned> ForceGatherCost("force-gather-cost", cl::Hidden,
+    cl::init(0), cl::desc("For debugging, sets user specified cost"));
 
 static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
 cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
@@ -1238,6 +1244,45 @@
   return Cost;
 }
 
+bool PPCTTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
+  return EnableLegalMaskedGather; // Follows -ppc-enable-gather.
+}
+
+/// Legal only with -ppc-enable-gather, for naturally aligned elements.
+bool PPCTTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
+  if (!EnableLegalMaskedGather)
+    return false;
+  const unsigned Bits = DataTy->getScalarSizeInBits();
+  const bool KnownWidth = Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64;
+  return KnownWidth && Alignment >= Bits / 8; // Bytes: any Align is >= 1.
+}
+
+// NOTE(review): the default for I conventionally lives on the declaration.
+/// Cost of a masked gather (load) or scatter (store) of vector \p DataTy.
+InstructionCost PPCTTIImpl::getGatherScatterOpCost(
+    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
+    Align Alignment, TTI::TargetCostKind CostKind,
+    const Instruction *I = nullptr) {
+  // Gather/scatter support is opt-in; otherwise defer to the generic model.
+  if (!EnableLegalMaskedGather)
+    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+                                         Alignment, CostKind, I);
+  // Debugging aid: an explicit -force-gather-cost replaces the estimate.
+  if (ForceGatherCost.getNumOccurrences() > 0)
+    return InstructionCost(ForceGatherCost);
+
+  assert(DataTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+  auto *VTy = cast<FixedVectorType>(DataTy);
+  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
+
+  // forceScalarizeMaskedGather causes these operations to be scalarized:
+  // charge one legalized op per element plus the packing overhead, which is
+  // inserts for a gather's result and extracts for a scatter's stored value.
+  const bool IsStore = Opcode == Instruction::Store;
+  return VTy->getNumElements() * LT.first +
+         getScalarizationOverhead(VTy, !IsStore, IsStore);
+}
+
 InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
Index: llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-masked-gather.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-masked-gather.ll
@@ -0,0 +1,51 @@
+; RUN: opt -S -loop-vectorize -mtriple=powerpc64le-unknown-unknown \
+; RUN: -ppc-enable-gather=true -force-gather-cost=1 -mcpu=pwr9 < %s | FileCheck %s
+
+; CHECK-LABEL: @vmul
+; CHECK: vector.body:
+; CHECK: call <2 x double> @llvm.masked.gather
+define dso_local noundef double @vmul(i32* noundef readonly %rowstart, i32* noundef readnone %rowend, double* nocapture noundef readonly %luval, double* nocapture noundef readonly %dst) {
+entry:
+  %cmp.not8 = icmp eq i32* %rowstart, %rowend ; empty index range: skip the loop
+  br i1 %cmp.not8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %add.lcssa = phi double [ %add, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %res.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+  ret double %res.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %col.011 = phi i32* [ %incdec.ptr, %for.body ], [ %rowstart, %for.body.preheader ]
+  %res.010 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+  %luval.addr.09 = phi double* [ %incdec.ptr1, %for.body ], [ %luval, %for.body.preheader ]
+  %0 = load double, double* %luval.addr.09, align 8, !tbaa !3
+  %1 = load i32, i32* %col.011, align 4, !tbaa !7 ; index loaded from memory
+  %idxprom = zext i32 %1 to i64
+  %arrayidx = getelementptr inbounds double, double* %dst, i64 %idxprom
+  %2 = load double, double* %arrayidx, align 8, !tbaa !3 ; indirect load: expected to become the masked gather
+  %mul = fmul fast double %2, %0
+  %add = fadd fast double %mul, %res.010 ; running sum carried through %res.010
+  %incdec.ptr = getelementptr inbounds i32, i32* %col.011, i64 1
+  %incdec.ptr1 = getelementptr inbounds double, double* %luval.addr.09, i64 1
+  %cmp.not = icmp eq i32* %incdec.ptr, %rowend
+  br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{!"XL C/C++ for Linux on Power, (IBM Internal Development Branch), clang version 14.0.0"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"double", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-masked-scatter.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-masked-scatter.ll
@@ -0,0 +1,45 @@
+; RUN: opt -S -loop-vectorize -mtriple=powerpc64le-unknown-unknown \
+; RUN: -ppc-enable-gather=true -force-gather-cost=1 -mcpu=pwr9 < %s | FileCheck %s
+
+; CHECK-LABEL: @test
+; CHECK: vector.body:
+; CHECK: call void @llvm.masked.scatter
+define dso_local void @test(float* noalias nocapture noundef readonly %in, float* noalias nocapture noundef writeonly %out, i32* noalias nocapture noundef readonly %index, i32 noundef signext %SIZE) {
+entry:
+  %cmp9 = icmp sgt i32 %SIZE, 0 ; run the loop only for a positive trip count
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %SIZE to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %in, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !tbaa !3
+  %add = fadd fast float %0, 5.000000e-01
+  %arrayidx2 = getelementptr inbounds i32, i32* %index, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4, !tbaa !7 ; destination index loaded from memory
+  %idxprom3 = sext i32 %1 to i64
+  %arrayidx4 = getelementptr inbounds float, float* %out, i64 %idxprom3
+  store float %add, float* %arrayidx4, align 4, !tbaa !3 ; indirect store: expected to become the masked scatter
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{!"XL C/C++ for Linux on Power, (IBM Internal Development Branch), clang version 14.0.0"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"float", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: llvm/test/Transforms/SLPVectorizer/PowerPC/slp-masked-gather.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SLPVectorizer/PowerPC/slp-masked-gather.ll
@@ -0,0 +1,63 @@
+; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer -ppc-enable-gather=true -force-gather-cost=1 < %s | FileCheck %s
+
+; CHECK-LABEL: @test
+; CHECK: call <4 x i32> @llvm.masked.gather
+define dso_local void @test(i32** noalias nocapture noundef readonly %a, i32* noalias nocapture noundef writeonly %b) {
+entry:
+  %arrayidx = getelementptr inbounds i32*, i32** %a, i64 5
+  %0 = load i32*, i32** %arrayidx, align 8, !tbaa !3 ; base pointer shared by all eight loads below
+  %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 18 ; load offsets step by 9 elements (18, 27, ..., 81)
+  %1 = load i32, i32* %arrayidx1, align 4, !tbaa !7
+  %sub = add nsw i32 %1, -10
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 1 ; stores go to consecutive slots %b[1..8]
+  store i32 %sub, i32* %arrayidx2, align 4, !tbaa !7
+  %arrayidx4 = getelementptr inbounds i32, i32* %0, i64 27
+  %2 = load i32, i32* %arrayidx4, align 4, !tbaa !7
+  %sub5 = add nsw i32 %2, -10
+  %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 2
+  store i32 %sub5, i32* %arrayidx6, align 4, !tbaa !7
+  %arrayidx8 = getelementptr inbounds i32, i32* %0, i64 36
+  %3 = load i32, i32* %arrayidx8, align 4, !tbaa !7
+  %sub9 = add nsw i32 %3, -10
+  %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 3
+  store i32 %sub9, i32* %arrayidx10, align 4, !tbaa !7
+  %arrayidx12 = getelementptr inbounds i32, i32* %0, i64 45
+  %4 = load i32, i32* %arrayidx12, align 4, !tbaa !7
+  %sub13 = add nsw i32 %4, -10
+  %arrayidx14 = getelementptr inbounds i32, i32* %b, i64 4
+  store i32 %sub13, i32* %arrayidx14, align 4, !tbaa !7
+  %arrayidx16 = getelementptr inbounds i32, i32* %0, i64 54
+  %5 = load i32, i32* %arrayidx16, align 4, !tbaa !7
+  %sub17 = add nsw i32 %5, -10
+  %arrayidx18 = getelementptr inbounds i32, i32* %b, i64 5
+  store i32 %sub17, i32* %arrayidx18, align 4, !tbaa !7
+  %arrayidx20 = getelementptr inbounds i32, i32* %0, i64 63
+  %6 = load i32, i32* %arrayidx20, align 4, !tbaa !7
+  %sub21 = add nsw i32 %6, -10
+  %arrayidx22 = getelementptr inbounds i32, i32* %b, i64 6
+  store i32 %sub21, i32* %arrayidx22, align 4, !tbaa !7
+  %arrayidx24 = getelementptr inbounds i32, i32* %0, i64 72
+  %7 = load i32, i32* %arrayidx24, align 4, !tbaa !7
+  %sub25 = add nsw i32 %7, -10
+  %arrayidx26 = getelementptr inbounds i32, i32* %b, i64 7
+  store i32 %sub25, i32* %arrayidx26, align 4, !tbaa !7
+  %arrayidx28 = getelementptr inbounds i32, i32* %0, i64 81
+  %8 = load i32, i32* %arrayidx28, align 4, !tbaa !7
+  %sub29 = add nsw i32 %8, -10
+  %arrayidx30 = getelementptr inbounds i32, i32* %b, i64 8
+  store i32 %sub29, i32* %arrayidx30, align 4, !tbaa !7
+  ret void
+}
+
+!llvm.module.flags = !{!0, !1} 
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{!"XL C/C++ for Linux on Power, (IBM Internal Development Branch), clang version 14.0.0"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"any pointer", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}