Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1763,12 +1763,23 @@
   return LT.first * 2;
 }
 
+static unsigned getGatherScatterOverhead(unsigned Opcode, bool UseSVE) {
+  // TODO: The SVE overhead is currently one fixed value for every CPU; at
+  // some point we may want to make it per-CPU.
+  if (UseSVE)
+    return 10;
+  // Without SVE, scatters (stores) get twice the overhead of gathers.
+  return Opcode == Instruction::Store ? 4 : 2;
+}
+
 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
-  if (useNeonVector(DataTy))
-    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
-                                         Alignment, CostKind, I);
+  if (useNeonVector(DataTy)) {
+    InstructionCost Cost = BaseT::getGatherScatterOpCost(
+        Opcode, DataTy, Ptr, VariableMask, Alignment, CostKind, I);
+    return Cost * getGatherScatterOverhead(Opcode, false);
+  }
   auto *VT = cast<VectorType>(DataTy);
   auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
   if (!LT.first.isValid())
@@ -1786,9 +1797,7 @@
   InstructionCost MemOpCost =
       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
   // Add on an overhead cost for using gathers/scatters.
-  // TODO: At the moment this is applied unilaterally for all CPUs, but at some
-  // point we may want a per-CPU overhead.
-  MemOpCost *= 10;
+  MemOpCost *= getGatherScatterOverhead(Opcode, true);
   return LT.first * MemOpCost * getMaxNumElements(LegalVF);
 }
 
Index: llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
===================================================================
--- llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
+++ llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
@@ -90,8 +90,8 @@
 declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32 immarg, <4 x i1>, <4 x i8>)
 define <4 x i8> @gather_load_4xi8_constant_mask(<4 x i8*> %ptrs) {
 ; CHECK:         gather_load_4xi8_constant_mask
-; CHECK-NEON:    Cost Model: Found an estimated cost of 17 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-NEON:    Cost Model: Found an estimated cost of 34 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 34 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
 ;
@@ -101,8 +101,8 @@
 
 define <4 x i8> @gather_load_4xi8_variable_mask(<4 x i8*> %ptrs, <4 x i1> %cond) {
 ; CHECK:         gather_load_4xi8_variable_mask
-; CHECK-NEON:    Cost Model: Found an estimated cost of 29 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-NEON:    Cost Model: Found an estimated cost of 58 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 58 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction:  %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
 ;
@@ -113,8 +113,8 @@
 declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32 immarg, <4 x i1>)
 define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x i8*> %ptrs) {
 ; CHECK:         scatter_store_4xi8_constant_mask
-; CHECK-NEON:    Cost Model: Found an estimated cost of 17 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-NEON:    Cost Model: Found an estimated cost of 68 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 68 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
 ;
@@ -124,8 +124,8 @@
 
 define void @scatter_store_4xi8_variable_mask(<4 x i8> %val, <4 x i8*> %ptrs, <4 x i1> %cond) {
 ; CHECK:         scatter_store_4xi8_variable_mask
-; CHECK-NEON:    Cost Model: Found an estimated cost of 29 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-NEON:    Cost Model: Found an estimated cost of 116 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 116 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction:  call void @llvm.masked.scatter.v4i8.v4p0i8(
 ;
@@ -136,8 +136,8 @@
 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32 immarg, <4 x i1>, <4 x i32>)
 define <4 x i32> @gather_load_4xi32_constant_mask(<4 x i32*> %ptrs) {
 ; CHECK:         gather_load_4xi32_constant_mask
-; CHECK-NEON:    Cost Model: Found an estimated cost of 17 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-NEON:    Cost Model: Found an estimated cost of 34 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 34 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
 ;
@@ -147,8 +147,8 @@
 
 define <4 x i32> @gather_load_4xi32_variable_mask(<4 x i32*> %ptrs, <4 x i1> %cond) {
 ; CHECK:         gather_load_4xi32_variable_mask
-; CHECK-NEON:    Cost Model: Found an estimated cost of 29 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-NEON:    Cost Model: Found an estimated cost of 58 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 58 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction:  %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
 ;
@@ -159,8 +159,8 @@
 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32 immarg, <4 x i1>)
 define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x i32*> %ptrs) {
 ; CHECK:         scatter_store_4xi32_constant_mask
-; CHECK-NEON:    Cost Model: Found an estimated cost of 17 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-NEON:    Cost Model: Found an estimated cost of 68 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 68 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
 ;
@@ -170,8 +170,8 @@
 
 define void @scatter_store_4xi32_variable_mask(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %cond) {
 ; CHECK:         scatter_store_4xi32_variable_mask
-; CHECK-NEON:    Cost Model: Found an estimated cost of 29 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-NEON:    Cost Model: Found an estimated cost of 116 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 116 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction:  call void @llvm.masked.scatter.v4i32.v4p0i32(
 ;
@@ -182,8 +182,8 @@
 declare <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*>, i32, <256 x i1>, <256 x i16>)
 define void @sve_gather_vls(<256 x i1> %v256i1mask) {
 ; CHECK-LABEL: 'sve_scatter_vls'
-; CHECK-NEON: Cost Model: Found an estimated cost of 1952 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1952 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
+; CHECK-NEON: Cost Model: Found an estimated cost of 3904 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 3904 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 2560 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 2560 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
 entry:
@@ -194,8 +194,8 @@
 declare <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*>, i32, <256 x i1>, <256 x float>)
 define void @sve_gather_vls_float(<256 x i1> %v256i1mask) {
 ; CHECK-LABEL: 'sve_gather_vls_float'
-; CHECK-NEON: Cost Model: Found an estimated cost of 1856 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1856 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
+; CHECK-NEON: Cost Model: Found an estimated cost of 3712 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 3712 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 2560 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 2560 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
 entry:
@@ -206,8 +206,8 @@
 declare void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8>, <256 x i8*>, i32, <256 x i1>)
 define void @sve_scatter_vls(<256 x i1> %v256i1mask){
 ; CHECK-LABEL: 'sve_scatter_vls'
-; CHECK-NEON: Cost Model: Found an estimated cost of 2000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask)
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 2000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask)
+; CHECK-NEON: Cost Model: Found an estimated cost of 8000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask)
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 8000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask)
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 2560 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask)
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 2560 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask)
 entry:
@@ -218,8 +218,8 @@
 declare void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half>, <512 x half*>, i32, <512 x i1>)
 define void @sve_scatter_vls_float(<512 x i1> %v512i1mask){
 ; CHECK-LABEL: 'sve_scatter_vls_float'
-; CHECK-NEON: Cost Model: Found an estimated cost of 3904 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask)
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 3904 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask)
+; CHECK-NEON: Cost Model: Found an estimated cost of 15616 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask)
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 15616 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask)
 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 5120 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask)
 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 5120 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask)
   call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask)