diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -991,6 +991,51 @@
     return Cost;
   }
 
+  unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+                                  const Value *Ptr, bool VariableMask,
+                                  Align Alignment, TTI::TargetCostKind CostKind,
+                                  const Instruction *I = nullptr) {
+    auto *VT = cast<FixedVectorType>(DataTy);
+    // Assume the target does not have support for gather/scatter operations
+    // and provide a rough estimate.
+    //
+    // First, compute the cost of extracting the individual addresses and the
+    // individual memory operations.
+    int LoadCost =
+        VT->getNumElements() *
+        (getVectorInstrCost(
+             Instruction::ExtractElement,
+             FixedVectorType::get(PointerType::get(VT->getElementType(), 0),
+                                  VT->getNumElements()),
+             -1) +
+         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
+
+    // Next, compute the cost of packing the result in a vector.
+    int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store,
+                                               Opcode == Instruction::Store);
+
+    int ConditionalCost = 0;
+    if (VariableMask) {
+      // Compute the cost of conditionally executing the memory operations with
+      // variable masks. This includes extracting the individual conditions,
+      // branches and PHIs to combine the results.
+      // NOTE: Estimating the cost of conditionally executing the memory
+      // operations accurately is quite difficult and the current solution
+      // provides a very rough estimate only.
+      ConditionalCost =
+          VT->getNumElements() *
+          (getVectorInstrCost(
+               Instruction::ExtractElement,
+               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
+                                    VT->getNumElements()),
+               -1) +
+           getCFInstrCost(Instruction::Br, CostKind) +
+           getCFInstrCost(Instruction::PHI, CostKind));
+    }
+
+    return LoadCost + PackingCost + ConditionalCost;
+  }
+
   unsigned getInterleavedMemoryOpCost(
       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
diff --git a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
--- a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
@@ -90,10 +90,10 @@
 declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32 immarg, <4 x i1>, <4 x i8>)
 define <4 x i8> @gather_load_4xi8_constant_mask(<4 x i8*> %ptrs) {
 ; CHECK: gather_load_4xi8_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
 ;
   %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
   ret <4 x i8> %lv
@@ -101,10 +101,10 @@
 
 define <4 x i8> @gather_load_4xi8_variable_mask(<4 x i8*> %ptrs, <4 x i1> %cond) {
 ; CHECK: gather_load_4xi8_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
 ;
   %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
   ret <4 x i8> %lv
@@ -113,10 +113,10 @@
 declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32 immarg, <4 x i1>)
 define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x i8*> %ptrs) {
 ; CHECK: scatter_store_4xi8_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
 ;
   call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret void
@@ -124,10 +124,10 @@
 
 define void @scatter_store_4xi8_variable_mask(<4 x i8> %val, <4 x i8*> %ptrs, <4 x i1> %cond) {
 ; CHECK: scatter_store_4xi8_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
 ;
   call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> %cond)
   ret void
@@ -136,10 +136,10 @@
 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32 immarg, <4 x i1>, <4 x i32>)
 define <4 x i32> @gather_load_4xi32_constant_mask(<4 x i32*> %ptrs) {
 ; CHECK: gather_load_4xi32_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
 ;
   %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
   ret <4 x i32> %lv
@@ -147,10 +147,10 @@
 
 define <4 x i32> @gather_load_4xi32_variable_mask(<4 x i32*> %ptrs, <4 x i1> %cond) {
 ; CHECK: gather_load_4xi32_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
 ;
   %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
   ret <4 x i32> %lv
@@ -159,10 +159,10 @@
 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32 immarg, <4 x i1>)
 define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x i32*> %ptrs) {
 ; CHECK: scatter_store_4xi32_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
 ;
   call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret void
@@ -170,10 +170,10 @@
 
 define void @scatter_store_4xi32_variable_mask(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %cond) {
 ; CHECK: scatter_store_4xi32_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
 ;
   call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 1, <4 x i1> %cond)
   ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -12,10 +12,7 @@
 ; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost '
 ; REMARK-NEXT: - Cost: '-7'
 ;
-; REMARK-LABEL: Function: gather_load
-; REMARK: Args:
-; REMARK-NEXT: - String: 'Stores SLP vectorized with cost
-; REMARK-NEXT: - Cost: '-2'
+; REMARK-NOT: Function: gather_load
 
 define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: @gather_multiple_use(
@@ -61,11 +58,25 @@
 define void @gather_load(i16* noalias %ptr) {
 ; CHECK-LABEL: @gather_load(
 ; CHECK-NEXT: [[ARRAYIDX182:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> <i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 1, i64 0), i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 2, i64 1), i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 3, i64 2), i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 4, i64 3)>, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
-; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw <4 x i16> [[TMP2]], <i16 10, i16 20, i16 30, i16 40>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX182]] to <4 x i16>*
-; CHECK-NEXT: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP4]], align 2
+; CHECK-NEXT: [[ARRAYIDX183:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 2
+; CHECK-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 3
+; CHECK-NEXT: [[ARRAYIDX185:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 4
+; CHECK-NEXT: [[L0:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 1, i64 0), align 1
+; CHECK-NEXT: [[CONV150:%.*]] = zext i8 [[L0]] to i16
+; CHECK-NEXT: [[ADD152:%.*]] = add nuw nsw i16 [[CONV150]], 10
+; CHECK-NEXT: [[L1:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 2, i64 1), align 1
+; CHECK-NEXT: [[CONV156:%.*]] = zext i8 [[L1]] to i16
+; CHECK-NEXT: [[ADD158:%.*]] = add nuw nsw i16 [[CONV156]], 20
+; CHECK-NEXT: [[L2:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 3, i64 2), align 1
+; CHECK-NEXT: [[CONV162:%.*]] = zext i8 [[L2]] to i16
+; CHECK-NEXT: [[ADD164:%.*]] = add nuw nsw i16 [[CONV162]], 30
+; CHECK-NEXT: [[L3:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 4, i64 3), align 1
+; CHECK-NEXT: [[CONV168:%.*]] = zext i8 [[L3]] to i16
+; CHECK-NEXT: [[ADD170:%.*]] = add nuw nsw i16 [[CONV168]], 40
+; CHECK-NEXT: store i16 [[ADD152]], i16* [[ARRAYIDX182]], align 2
+; CHECK-NEXT: store i16 [[ADD158]], i16* [[ARRAYIDX183]], align 2
+; CHECK-NEXT: store i16 [[ADD164]], i16* [[ARRAYIDX184]], align 2
+; CHECK-NEXT: store i16 [[ADD170]], i16* [[ARRAYIDX185]], align 2
 ; CHECK-NEXT: ret void
 ;
   %arrayidx182 = getelementptr inbounds i16, i16* %ptr, i64 1
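
Note (commentary, not part of the patch): the new getGatherScatterOpCost hook in BasicTTIImplBase is only a generic fallback. It models a masked gather/scatter as "extract each address + one scalar memory op per lane + re-pack the result", plus per-lane mask extract, branch, and PHI costs when the mask is variable, which is where the 17 (constant mask) and 29 (variable mask) figures in the AArch64 cost-model test come from. A target with native gather/scatter instructions is expected to shadow this method in its own TTI implementation so that the scalarization estimate is not used. The sketch below illustrates the shape of such an override; MyTargetTTIImpl, hasNativeGatherScatter, and the flat cost of 1 are hypothetical and do not exist in LLVM, while the override signature matches the hook added above.

// Illustrative sketch only; the usual target TTI boilerplate (constructor,
// getST(), getTLI(), ...) is elided, and hasNativeGatherScatter is a made-up
// target query standing in for whatever legality check the target provides.
#include "llvm/CodeGen/BasicTTIImpl.h"

namespace llvm {

class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
  using BaseT = BasicTTIImplBase<MyTargetTTIImpl>;
  // ... target TTI boilerplate elided ...

public:
  unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                  const Value *Ptr, bool VariableMask,
                                  Align Alignment, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr) {
    // If the hardware can issue this gather/scatter directly, report a cheap
    // flat cost instead of the scalarized estimate.
    if (hasNativeGatherScatter(DataTy, Alignment))
      return 1;
    // Otherwise fall back to the generic estimate introduced by this patch
    // (address extracts + scalar memory ops + packing, plus branch/PHI
    // overhead for a variable mask).
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);
  }

private:
  bool hasNativeGatherScatter(Type *DataTy, Align Alignment) const;
};

} // namespace llvm

Because AArch64 provides no such override in this patch, masked gathers/scatters stop being treated as unit cost, which is what flips the expectations in mem-op-cost-model.ll from 1 to 17/29 and makes the SLP vectorization of gather_load in gather-cost.ll unprofitable.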