diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -991,6 +991,51 @@
     return Cost;
   }
 
+  unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+                                  const Value *Ptr, bool VariableMask,
+                                  Align Alignment, TTI::TargetCostKind CostKind,
+                                  const Instruction *I = nullptr) {
+    auto *VT = cast<FixedVectorType>(DataTy);
+    // Assume the target does not have support for gather/scatter operations
+    // and provide a rough estimate.
+    //
+    // First, compute the cost of extracting the individual addresses and the
+    // individual memory operations.
+    int LoadCost =
+        VT->getNumElements() *
+        (getVectorInstrCost(
+             Instruction::ExtractElement,
+             FixedVectorType::get(PointerType::get(VT->getElementType(), 0),
+                                  VT->getNumElements()),
+             -1) +
+         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
+
+    // Next, compute the cost of packing the result in a vector.
+    int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store,
+                                               Opcode == Instruction::Store);
+
+    int ConditionalCost = 0;
+    if (VariableMask) {
+      // Compute the cost of conditionally executing the memory operations with
+      // variable masks. This includes extracting the individual conditions,
+      // branches and PHIs to combine the results.
+      // NOTE: Estimating the cost of conditionally executing the memory
+      // operations accurately is quite difficult and the current solution
+      // provides a very rough estimate only.
+      ConditionalCost =
+          VT->getNumElements() *
+          (getVectorInstrCost(
+               Instruction::ExtractElement,
+               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
+                                    VT->getNumElements()),
+               -1) +
+           getCFInstrCost(Instruction::Br, CostKind) +
+           getCFInstrCost(Instruction::PHI, CostKind));
+    }
+
+    return LoadCost + PackingCost + ConditionalCost;
+  }
+
   unsigned getInterleavedMemoryOpCost(
       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
diff --git a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
--- a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
@@ -90,10 +90,10 @@
 declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32 immarg, <4 x i1>, <4 x i8>)
 define <4 x i8> @gather_load_4xi8_constant_mask(<4 x i8*> %ptrs) {
 ; CHECK: gather_load_4xi8_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
 ;
   %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
   ret <4 x i8> %lv
@@ -101,10 +101,10 @@
 
 define <4 x i8> @gather_load_4xi8_variable_mask(<4 x i8*> %ptrs, <4 x i1> %cond) {
 ; CHECK: gather_load_4xi8_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
 ;
   %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
   ret <4 x i8> %lv
@@ -113,10 +113,10 @@
 declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32 immarg, <4 x i1>)
 define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x i8*> %ptrs) {
 ; CHECK: scatter_store_4xi8_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
 ;
   call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret void
@@ -124,10 +124,10 @@
 
 define void @scatter_store_4xi8_variable_mask(<4 x i8> %val, <4 x i8*> %ptrs, <4 x i1> %cond) {
 ; CHECK: scatter_store_4xi8_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
 ;
   call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> %cond)
   ret void
@@ -136,10 +136,10 @@
 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32 immarg, <4 x i1>, <4 x i32>)
 define <4 x i32> @gather_load_4xi32_constant_mask(<4 x i32*> %ptrs) {
 ; CHECK: gather_load_4xi32_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
 ;
   %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
   ret <4 x i32> %lv
@@ -147,10 +147,10 @@
 
 define <4 x i32> @gather_load_4xi32_variable_mask(<4 x i32*> %ptrs, <4 x i1> %cond) {
 ; CHECK: gather_load_4xi32_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
 ;
   %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
   ret <4 x i32> %lv
@@ -159,10 +159,10 @@
 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32 immarg, <4 x i1>)
 define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x i32*> %ptrs) {
 ; CHECK: scatter_store_4xi32_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
 ;
   call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret void
@@ -170,10 +170,10 @@
 
 define void @scatter_store_4xi32_variable_mask(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %cond) {
 ; CHECK: scatter_store_4xi32_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
 ;
   call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 1, <4 x i1> %cond)
   ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -12,10 +12,7 @@
 ; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost '
 ; REMARK-NEXT: - Cost: '-7'
 ;
-; REMARK-LABEL: Function: gather_load
-; REMARK: Args:
-; REMARK-NEXT: - String: 'Stores SLP vectorized with cost
-; REMARK-NEXT: - Cost: '-2'
+; REMARK-NOT: Function: gather_load
 
 define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: @gather_multiple_use(
@@ -61,11 +58,25 @@
 define void @gather_load(i16* noalias %ptr) {
 ; CHECK-LABEL: @gather_load(
 ; CHECK-NEXT: [[ARRAYIDX182:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> <i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 1, i64 0), i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 2, i64 1), i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 3, i64 2), i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 4, i64 3)>, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
-; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw <4 x i16> [[TMP2]], <i16 10, i16 20, i16 30, i16 40>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX182]] to <4 x i16>*
-; CHECK-NEXT: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP4]], align 2
+; CHECK-NEXT: [[ARRAYIDX183:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 2
+; CHECK-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 3
+; CHECK-NEXT: [[ARRAYIDX185:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 4
+; CHECK-NEXT: [[L0:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 1, i64 0), align 1
+; CHECK-NEXT: [[CONV150:%.*]] = zext i8 [[L0]] to i16
+; CHECK-NEXT: [[ADD152:%.*]] = add nuw nsw i16 [[CONV150]], 10
+; CHECK-NEXT: [[L1:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 2, i64 1), align 1
+; CHECK-NEXT: [[CONV156:%.*]] = zext i8 [[L1]] to i16
+; CHECK-NEXT: [[ADD158:%.*]] = add nuw nsw i16 [[CONV156]], 20
+; CHECK-NEXT: [[L2:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 3, i64 2), align 1
+; CHECK-NEXT: [[CONV162:%.*]] = zext i8 [[L2]] to i16
+; CHECK-NEXT: [[ADD164:%.*]] = add nuw nsw i16 [[CONV162]], 30
+; CHECK-NEXT: [[L3:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 4, i64 3), align 1
+; CHECK-NEXT: [[CONV168:%.*]] = zext i8 [[L3]] to i16
+; CHECK-NEXT: [[ADD170:%.*]] = add nuw nsw i16 [[CONV168]], 40
+; CHECK-NEXT: store i16 [[ADD152]], i16* [[ARRAYIDX182]], align 2
+; CHECK-NEXT: store i16 [[ADD158]], i16* [[ARRAYIDX183]], align 2
+; CHECK-NEXT: store i16 [[ADD164]], i16* [[ARRAYIDX184]], align 2
+; CHECK-NEXT: store i16 [[ADD170]], i16* [[ARRAYIDX185]], align 2
 ; CHECK-NEXT: ret void
 ;
   %arrayidx182 = getelementptr inbounds i16, i16* %ptr, i64 1
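
Note (commentary, not part of the patch): the new getGatherScatterOpCost hook in BasicTTIImplBase is only a generic fallback. It models a masked gather/scatter as "extract each address + one scalar memory op per lane + re-pack the result", plus per-lane mask extract, branch, and PHI costs when the mask is variable, which is where the 17 (constant mask) and 29 (variable mask) figures in the AArch64 cost-model test come from. A target with native gather/scatter instructions is expected to shadow this method in its own TTI implementation so that the scalarization estimate is not used. The sketch below illustrates the shape of such an override; MyTargetTTIImpl, hasNativeGatherScatter, and the flat cost of 1 are hypothetical and do not exist in LLVM, while the override signature matches the hook added above.

// Illustrative sketch only; the usual target TTI boilerplate (constructor,
// getST(), getTLI(), ...) is elided, and hasNativeGatherScatter is a made-up
// target query standing in for whatever legality check the target provides.
#include "llvm/CodeGen/BasicTTIImpl.h"

namespace llvm {

class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
  using BaseT = BasicTTIImplBase<MyTargetTTIImpl>;
  // ... target TTI boilerplate elided ...

public:
  unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                  const Value *Ptr, bool VariableMask,
                                  Align Alignment, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr) {
    // If the hardware can issue this gather/scatter directly, report a cheap
    // flat cost instead of the scalarized estimate.
    if (hasNativeGatherScatter(DataTy, Alignment))
      return 1;
    // Otherwise fall back to the generic estimate introduced by this patch
    // (address extracts + scalar memory ops + packing, plus branch/PHI
    // overhead for a variable mask).
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);
  }

private:
  bool hasNativeGatherScatter(Type *DataTy, Align Alignment) const;
};

} // namespace llvm

Because AArch64 provides no such override in this patch, masked gathers/scatters stop being treated as unit cost, which is what flips the expectations in mem-op-cost-model.ll from 1 to 17/29 and makes the SLP vectorization of gather_load in gather-cost.ll unprofitable.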