Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1763,12 +1763,23 @@ return LT.first * 2; } +static unsigned getGatherScatterOverhead(unsigned Opcode, bool UseSVE) { + // TODO: At the moment the SVE cost is applied unilaterally for all CPUs, but + // at some point we may want a per-CPU overhead. + if (Opcode == Instruction::Store) + return UseSVE ? 10 : 4; + else + return UseSVE ? 10 : 2; +} + InstructionCost AArch64TTIImpl::getGatherScatterOpCost( unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { - if (useNeonVector(DataTy)) - return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment, CostKind, I); + if (useNeonVector(DataTy)) { + InstructionCost Cost = BaseT::getGatherScatterOpCost( + Opcode, DataTy, Ptr, VariableMask, Alignment, CostKind, I); + return Cost * getGatherScatterOverhead(Opcode, false); + } auto *VT = cast<VectorType>(DataTy); auto LT = TLI->getTypeLegalizationCost(DL, DataTy); if (!LT.first.isValid()) @@ -1786,9 +1797,7 @@ InstructionCost MemOpCost = getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); // Add on an overhead cost for using gathers/scatters. - // TODO: At the moment this is applied unilaterally for all CPUs, but at some - // point we may want a per-CPU overhead. 
- MemOpCost *= 10; + MemOpCost *= getGatherScatterOverhead(Opcode, true); return LT.first * MemOpCost * getMaxNumElements(LegalVF); } Index: llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll +++ llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll @@ -90,8 +90,8 @@ declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32 immarg, <4 x i1>, <4 x i8>) define <4 x i8> @gather_load_4xi8_constant_mask(<4 x i8*> %ptrs) { ; CHECK: gather_load_4xi8_constant_mask -; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8 -; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8 +; CHECK-NEON: Cost Model: Found an estimated cost of 34 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8 +; CHECK-SVE-128: Cost Model: Found an estimated cost of 34 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8 ; @@ -101,8 +101,8 @@ define <4 x i8> @gather_load_4xi8_variable_mask(<4 x i8*> %ptrs, <4 x i1> %cond) { ; CHECK: gather_load_4xi8_variable_mask -; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8 -; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8 +; CHECK-NEON: Cost Model: Found an estimated cost of 58 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8 +; CHECK-SVE-128: Cost Model: Found an estimated cost of 58 for instruction: %lv = call <4 x i8> 
@llvm.masked.gather.v4i8.v4p0i8 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8 ; @@ -113,8 +113,8 @@ declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32 immarg, <4 x i1>) define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x i8*> %ptrs) { ; CHECK: scatter_store_4xi8_constant_mask -; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( -; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( +; CHECK-NEON: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( +; CHECK-SVE-128: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( ; @@ -124,8 +124,8 @@ define void @scatter_store_4xi8_variable_mask(<4 x i8> %val, <4 x i8*> %ptrs, <4 x i1> %cond) { ; CHECK: scatter_store_4xi8_variable_mask -; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( -; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( +; CHECK-NEON: Cost Model: Found an estimated cost of 116 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( +; CHECK-SVE-128: Cost Model: Found an estimated cost of 116 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( 
; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8( ; @@ -136,8 +136,8 @@ declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32 immarg, <4 x i1>, <4 x i32>) define <4 x i32> @gather_load_4xi32_constant_mask(<4 x i32*> %ptrs) { ; CHECK: gather_load_4xi32_constant_mask -; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 -; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 +; CHECK-NEON: Cost Model: Found an estimated cost of 34 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 +; CHECK-SVE-128: Cost Model: Found an estimated cost of 34 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 ; @@ -147,8 +147,8 @@ define <4 x i32> @gather_load_4xi32_variable_mask(<4 x i32*> %ptrs, <4 x i1> %cond) { ; CHECK: gather_load_4xi32_variable_mask -; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 -; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 +; CHECK-NEON: Cost Model: Found an estimated cost of 58 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 +; CHECK-SVE-128: Cost Model: Found an estimated cost of 58 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for 
instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 ; @@ -159,8 +159,8 @@ declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32 immarg, <4 x i1>) define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x i32*> %ptrs) { ; CHECK: scatter_store_4xi32_constant_mask -; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( -; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( +; CHECK-NEON: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( +; CHECK-SVE-128: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( ; @@ -170,8 +170,8 @@ define void @scatter_store_4xi32_variable_mask(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %cond) { ; CHECK: scatter_store_4xi32_variable_mask -; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( -; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( +; CHECK-NEON: Cost Model: Found an estimated cost of 116 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( +; CHECK-SVE-128: Cost Model: Found an estimated cost of 116 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( ; CHECK-SVE-256: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( ; CHECK-SVE-512: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32( ; @@ -182,8 +182,8 @@ declare <256 x i16> 
@llvm.masked.gather.v256i16.v256p0i16(<256 x i16*>, i32, <256 x i1>, <256 x i16>) define void @sve_gather_vls(<256 x i1> %v256i1mask) { ; CHECK-LABEL: 'sve_scatter_vls' -; CHECK-NEON: Cost Model: Found an estimated cost of 1952 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) -; CHECK-SVE-128: Cost Model: Found an estimated cost of 1952 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) +; CHECK-NEON: Cost Model: Found an estimated cost of 3904 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) +; CHECK-SVE-128: Cost Model: Found an estimated cost of 3904 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) ; CHECK-SVE-256: Cost Model: Found an estimated cost of 2560 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) ; CHECK-SVE-512: Cost Model: Found an estimated cost of 2560 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0i16(<256 x i16*> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) entry: @@ -194,8 +194,8 @@ declare <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*>, i32, <256 x i1>, <256 x float>) define void @sve_gather_vls_float(<256 x i1> %v256i1mask) { ; CHECK-LABEL: 'sve_gather_vls_float' -; CHECK-NEON: Cost Model: Found an estimated cost of 1856 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) 
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1856 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) +; CHECK-NEON: Cost Model: Found an estimated cost of 3712 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) +; CHECK-SVE-128: Cost Model: Found an estimated cost of 3712 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) ; CHECK-SVE-256: Cost Model: Found an estimated cost of 2560 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) ; CHECK-SVE-512: Cost Model: Found an estimated cost of 2560 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0f32(<256 x float*> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) entry: @@ -206,8 +206,8 @@ declare void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8>, <256 x i8*>, i32, <256 x i1>) define void @sve_scatter_vls(<256 x i1> %v256i1mask){ ; CHECK-LABEL: 'sve_scatter_vls' -; CHECK-NEON: Cost Model: Found an estimated cost of 2000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask) -; CHECK-SVE-128: Cost Model: Found an estimated cost of 2000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask) +; CHECK-NEON: Cost Model: Found an estimated cost of 8000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask) +; CHECK-SVE-128: Cost Model: Found an 
estimated cost of 8000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask) ; CHECK-SVE-256: Cost Model: Found an estimated cost of 2560 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask) ; CHECK-SVE-512: Cost Model: Found an estimated cost of 2560 for instruction: call void @llvm.masked.scatter.v256i8.v256p0i8(<256 x i8> undef, <256 x i8*> undef, i32 0, <256 x i1> %v256i1mask) entry: @@ -218,8 +218,8 @@ declare void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half>, <512 x half*>, i32, <512 x i1>) define void @sve_scatter_vls_float(<512 x i1> %v512i1mask){ ; CHECK-LABEL: 'sve_scatter_vls_float' -; CHECK-NEON: Cost Model: Found an estimated cost of 3904 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask) -; CHECK-SVE-128: Cost Model: Found an estimated cost of 3904 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask) +; CHECK-NEON: Cost Model: Found an estimated cost of 15616 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask) +; CHECK-SVE-128: Cost Model: Found an estimated cost of 15616 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask) ; CHECK-SVE-256: Cost Model: Found an estimated cost of 5120 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask) ; CHECK-SVE-512: Cost Model: Found an estimated cost of 5120 for instruction: call void @llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask) call void 
@llvm.masked.scatter.v512f16.v512p0f16(<512 x half> undef, <512 x half*> undef, i32 0, <512 x i1> %v512i1mask)