diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -189,6 +189,55 @@
     llvm_unreachable("Unexpected MemIndexedMode");
   }
 
+  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
+                                              Align Alignment,
+                                              bool VariableMask,
+                                              bool IsGatherScatter,
+                                              TTI::TargetCostKind CostKind) {
+    auto *VT = cast<FixedVectorType>(DataTy);
+    // Assume the target does not have support for gather/scatter operations
+    // and provide a rough estimate.
+    //
+    // First, compute the cost of the individual memory operations.
+    InstructionCost AddrExtractCost =
+        IsGatherScatter
+            ? getVectorInstrCost(Instruction::ExtractElement,
+                                 FixedVectorType::get(
+                                     PointerType::get(VT->getElementType(), 0),
+                                     VT->getNumElements()),
+                                 -1)
+            : 0;
+    InstructionCost LoadCost =
+        VT->getNumElements() *
+        (AddrExtractCost +
+         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
+
+    // Next, compute the cost of packing the result in a vector.
+    int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store,
+                                               Opcode == Instruction::Store);
+
+    InstructionCost ConditionalCost = 0;
+    if (VariableMask) {
+      // Compute the cost of conditionally executing the memory operations with
+      // variable masks. This includes extracting the individual conditions,
+      // branches and PHIs to combine the results.
+      // NOTE: Estimating the cost of conditionally executing the memory
+      // operations accurately is quite difficult and the current solution
+      // provides a very rough estimate only.
+      ConditionalCost =
+          VT->getNumElements() *
+          (getVectorInstrCost(
+               Instruction::ExtractElement,
+               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
+                                    VT->getNumElements()),
+               -1) +
+           getCFInstrCost(Instruction::Br, CostKind) +
+           getCFInstrCost(Instruction::PHI, CostKind));
+    }
+
+    return LoadCost + PackingCost + ConditionalCost;
+  }
+
 protected:
   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
       : BaseT(DL) {}
@@ -1024,50 +1073,20 @@
     return Cost;
   }
 
+  InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
+                                        Align Alignment, unsigned AddressSpace,
+                                        TTI::TargetCostKind CostKind) {
+    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
+                                       CostKind);
+  }
+
   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,
                                          Align Alignment,
                                          TTI::TargetCostKind CostKind,
                                          const Instruction *I = nullptr) {
-    auto *VT = cast<FixedVectorType>(DataTy);
-    // Assume the target does not have support for gather/scatter operations
-    // and provide a rough estimate.
-    //
-    // First, compute the cost of extracting the individual addresses and the
-    // individual memory operations.
-    InstructionCost LoadCost =
-        VT->getNumElements() *
-        (getVectorInstrCost(
-             Instruction::ExtractElement,
-             FixedVectorType::get(PointerType::get(VT->getElementType(), 0),
-                                  VT->getNumElements()),
-             -1) +
-         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
-
-    // Next, compute the cost of packing the result in a vector.
-    int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store,
-                                               Opcode == Instruction::Store);
-
-    InstructionCost ConditionalCost = 0;
-    if (VariableMask) {
-      // Compute the cost of conditionally executing the memory operations with
-      // variable masks. This includes extracting the individual conditions, a
-      // branches and PHIs to combine the results.
-      // NOTE: Estimating the cost of conditionally executing the memory
-      // operations accurately is quite difficult and the current solution
-      // provides a very rough estimate only.
-      ConditionalCost =
-          VT->getNumElements() *
-          (getVectorInstrCost(
-               Instruction::ExtractElement,
-               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
-                                    VT->getNumElements()),
-               -1) +
-           getCFInstrCost(Instruction::Br, CostKind) +
-           getCFInstrCost(Instruction::PHI, CostKind));
-    }
-
-    return LoadCost + PackingCost + ConditionalCost;
+    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
+                                       true, CostKind);
   }
 
   InstructionCost getInterleavedMemoryOpCost(
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -133,6 +133,10 @@
 
   unsigned getMaxInterleaveFactor(unsigned VF);
 
+  InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+                                        Align Alignment, unsigned AddressSpace,
+                                        TTI::TargetCostKind CostKind);
+
   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,
                                          Align Alignment,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1038,6 +1038,17 @@
   return Options;
 }
 
+InstructionCost
+AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+                                      Align Alignment, unsigned AddressSpace,
+                                      TTI::TargetCostKind CostKind) {
+  if (!isa<ScalableVectorType>(Src))
+    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+                                        CostKind);
+  auto LT = TLI->getTypeLegalizationCost(DL, Src);
+  return LT.first * 2;
+}
+
 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+define void @fixed() {
+; CHECK-LABEL: 'fixed'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8>* undef, i32 8, <2 x i1> undef, <2 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 8, <4 x i1> undef, <4 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 8, <8 x i1> undef, <8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 109 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16>* undef, i32 8, <2 x i1> undef, <2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 8, <4 x i1> undef, <4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 8, <2 x i1> undef, <2 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half>* undef, i32 8, <2 x i1> undef, <2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* undef, i32 8, <4 x i1> undef, <4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 8, <2 x i1> undef, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 8, <4 x i1> undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 8, <2 x i1> undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 8, <4 x i1> undef, <4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* undef, i32 8, <32 x i1> undef, <32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  ; Legal fixed-width integer types
+  %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8> *undef, i32 8, <2 x i1> undef, <2 x i8> undef)
+  %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8> *undef, i32 8, <4 x i1> undef, <4 x i8> undef)
+  %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8> *undef, i32 8, <8 x i1> undef, <8 x i8> undef)
+  %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8> *undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+  %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16> *undef, i32 8, <2 x i1> undef, <2 x i16> undef)
+  %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16> *undef, i32 8, <4 x i1> undef, <4 x i16> undef)
+  %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16> *undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+  %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32> *undef, i32 8, <2 x i1> undef, <2 x i32> undef)
+  %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32> *undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+  %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64> *undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+
+  ; Legal fixed-width floating point types
+  %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half> *undef, i32 8, <2 x i1> undef, <2 x half> undef)
+  %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half> *undef, i32 8, <4 x i1> undef, <4 x half> undef)
+  %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half> *undef, i32 8, <8 x i1> undef, <8 x half> undef)
+  %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float> *undef, i32 8, <2 x i1> undef, <2 x float> undef)
+  %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float> *undef, i32 8, <4 x i1> undef, <4 x float> undef)
+  %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double> *undef, i32 8, <2 x i1> undef, <2 x double> undef)
+
+  ; A couple of examples of illegal fixed-width types
+  %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64> *undef, i32 8, <4 x i1> undef, <4 x i64> undef)
+  %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half> *undef, i32 8, <32 x i1> undef, <32 x half> undef)
+
+  ret void
+}
+
+
+define void @scalable() {
+; CHECK-LABEL: 'scalable'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  ; Legal scalable integer types
+  %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
+  %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+  %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+  %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
+  %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
+  %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+  %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
+  %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
+  %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
+  %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
+
+  ; Legal scalable floating point types
+  %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
+  %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
+  %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
+  %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
+  %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
+  %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+
+  ; A couple of examples of illegal scalable types
+  %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
+  %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
+
+  ret void
+}
+
+declare <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8>*, i32, <2 x i1>, <2 x i8>)
+declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+declare <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16>*, i32, <2 x i1>, <2 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
+declare <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half>*, i32, <2 x i1>, <2 x half>)
+declare <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>)
+declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
+declare <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>*, i32, <32 x i1>, <32 x half>)
+declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+
+
+declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8>*, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8>*, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
+declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>*, i32, <vscale x 8 x i1>, <vscale x 8 x i8>)
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64>*, i32, <vscale x 4 x i1>, <vscale x 4 x i64>)
+declare <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
+declare <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half>*, i32, <vscale x 32 x i1>, <vscale x 32 x half>)
+declare <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -0,0 +1,92 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -S -debug < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK-COST: Checking a loop in "fixed_width"
+; CHECK-COST: Found an estimated cost of 11 for VF 2 For instruction: store i32 2, i32* %arrayidx1, align 4
+; CHECK-COST: Found an estimated cost of 25 for VF 4 For instruction: store i32 2, i32* %arrayidx1, align 4
+; CHECK-COST: Selecting VF: 1.
+
+; We should decide this loop is not worth vectorising using fixed width vectors
+define void @fixed_width(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @fixed_width(
+; CHECK-NOT: vector.body
+entry:
+  %cmp6 = icmp sgt i64 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %i.07 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %i.07
+  store i32 2, i32* %arrayidx1, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i64 %i.07, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
+; CHECK-COST: Checking a loop in "scalable"
+; CHECK-COST: Found an estimated cost of 2 for VF vscale x 4 For instruction: store i32 2, i32* %arrayidx1, align 4
+
+define void @scalable(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @scalable(
+; CHECK: vector.body
+; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32
+entry:
+  %cmp6 = icmp sgt i64 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %i.07 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %i.07
+  store i32 2, i32* %arrayidx1, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i64 %i.07, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0
+}
+
+attributes #0 = { "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -128,8 +128,9 @@
 
 attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve"}
 
-!0 = distinct !{!0, !1, !2, !3, !4}
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
 !1 = !{!"llvm.loop.mustprogress"}
 !2 = !{!"llvm.loop.vectorize.width", i32 4}
 !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
 !4 = !{!"llvm.loop.vectorize.enable", i1 true}
+!5 = !{!"llvm.loop.interleave.count", i32 2}
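
Illustration (not part of the patch): the scalarization estimate that getCommonMaskedMemoryOpCost introduces above can be sketched as a standalone C++ function. The per-operation costs below are hypothetical placeholders for what getVectorInstrCost, getMemoryOpCost, getCFInstrCost and getScalarizationOverhead would return on a concrete target; only the structure of the estimate, LoadCost + PackingCost + ConditionalCost, is taken from the patch.

// Standalone sketch of the scalarized masked-memory-op estimate, assuming
// simple per-lane unit costs instead of the real TTI hooks.
#include <cstdio>

struct PerOpCosts {
  unsigned VecExtract;  // extract one lane (address, condition, or store data)
  unsigned VecInsert;   // insert one loaded lane into the result vector
  unsigned ScalarMemOp; // one scalar load or store
  unsigned Branch;      // one conditional branch around a lane's memory op
  unsigned Phi;         // one PHI to merge the conditionally produced value
};

unsigned commonMaskedMemoryOpCost(unsigned NumElts, bool IsLoad,
                                  bool VariableMask, bool IsGatherScatter,
                                  const PerOpCosts &C) {
  // Gathers/scatters additionally extract each address from the pointer vector.
  unsigned AddrExtract = IsGatherScatter ? C.VecExtract : 0;
  unsigned MemCost = NumElts * (AddrExtract + C.ScalarMemOp);
  // Packing: inserts for a load result, extracts of the stored data otherwise.
  unsigned PackingCost = NumElts * (IsLoad ? C.VecInsert : C.VecExtract);
  // Variable masks force per-lane control flow: extract the i1, branch, PHI.
  unsigned ConditionalCost =
      VariableMask ? NumElts * (C.VecExtract + C.Branch + C.Phi) : 0;
  return MemCost + PackingCost + ConditionalCost;
}

int main() {
  PerOpCosts C{/*VecExtract=*/2, /*VecInsert=*/2, /*ScalarMemOp=*/1,
               /*Branch=*/1, /*Phi=*/1};
  // Rough estimate for a 4-lane masked store with a variable mask.
  std::printf("estimate: %u\n",
              commonMaskedMemoryOpCost(4, /*IsLoad=*/false,
                                       /*VariableMask=*/true,
                                       /*IsGatherScatter=*/false, C));
  return 0;
}

Even with these placeholder unit costs the estimate grows linearly in the number of lanes, which is why the fixed-width costs in masked_ldst.ll are large compared with the flat LT.first * 2 that the AArch64 hook returns for legal scalable vector types.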