Index: ../include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- ../include/llvm/Analysis/TargetTransformInfo.h +++ ../include/llvm/Analysis/TargetTransformInfo.h @@ -458,6 +458,15 @@ int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of Gather or Scatter operation + /// \p Opcode - is a type of memory access Load or Store + /// \p DataTy - a vector type of the data to be loaded or stored + /// \p Ptr - pointer [or vector of pointers] - address[es] in memory + /// \p VariableMask - true when the memory access is predicated + /// \p Alignment - alignment of single element + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, unsigned Alignment) const; + /// \return The cost of the interleaved memory operation. /// \p Opcode is the memory operation code /// \p VecTy is the vector type of the interleaved access. @@ -485,10 +494,14 @@ /// ((v0+v2), (v1+v3), undef, undef) int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const; - /// \returns The cost of Intrinsic instructions. + /// \returns The cost of Intrinsic instructions. Types analysis only. int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const; + /// \returns The cost of Intrinsic instructions. Analyses the real arguments. + int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args) const; + /// \returns The cost of Call instructions. 
int getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) const; @@ -614,6 +627,9 @@ virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) = 0; virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, @@ -623,6 +639,8 @@ bool IsPairwiseForm) = 0; virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) = 0; + virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args) = 0; virtual int getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) = 0; virtual unsigned getNumberOfParts(Type *Tp) = 0; @@ -791,6 +809,12 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) override { + return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment); + } int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace) override { @@ -805,6 +829,10 @@ ArrayRef Tys) override { return Impl.getIntrinsicInstrCost(ID, RetTy, Tys); } + int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args) override { + return Impl.getIntrinsicInstrCost(ID, RetTy, Args); + } int getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) override { return Impl.getCallInstrCost(F, RetTy, Tys); Index: ../include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- ../include/llvm/Analysis/TargetTransformInfoImpl.h +++ ../include/llvm/Analysis/TargetTransformInfoImpl.h @@ -301,6 +301,12 @@ return 1; } + unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, + 
unsigned Alignment) { + return 1; + } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, @@ -313,6 +319,10 @@ ArrayRef Tys) { return 1; } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args) { + return 1; + } unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) { return 1; Index: ../include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- ../include/llvm/CodeGen/BasicTTIImpl.h +++ ../include/llvm/CodeGen/BasicTTIImpl.h @@ -580,6 +580,39 @@ return Cost; } + /// Get intrinsic cost based on arguments + unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, + ArrayRef Args) { + switch (IID) { + default: { + SmallVector Types; + for (Value *Op : Args) + Types.push_back(Op->getType()); + return getIntrinsicInstrCost(IID, RetTy, Types); + } + case Intrinsic::masked_scatter: { + Value *Mask = Args[3]; + bool VarMask = !isa(Mask); + unsigned Alignment = cast(Args[2])->getZExtValue(); + return + static_cast(this)->getGatherScatterOpCost(Instruction::Store, + Args[0]->getType(), + Args[1], VarMask, + Alignment); + } + case Intrinsic::masked_gather: { + Value *Mask = Args[2]; + bool VarMask = !isa(Mask); + unsigned Alignment = cast(Args[1])->getZExtValue(); + return + static_cast(this)->getGatherScatterOpCost(Instruction::Load, + RetTy, Args[0], VarMask, + Alignment); + } + } + } + + /// Get intrinsic cost based on argument types unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys) { unsigned ISD = 0; Index: ../lib/Analysis/CostModel.cpp =================================================================== --- ../lib/Analysis/CostModel.cpp +++ ../lib/Analysis/CostModel.cpp @@ -500,12 +500,12 @@ } case Instruction::Call: if (const IntrinsicInst *II = dyn_cast(I)) { - SmallVector Tys; + SmallVector Args; for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J) - 
Tys.push_back(II->getArgOperand(J)->getType()); + Args.push_back(II->getArgOperand(J)); return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), - Tys); + Args); } return -1; default: Index: ../lib/Analysis/TargetTransformInfo.cpp =================================================================== --- ../lib/Analysis/TargetTransformInfo.cpp +++ ../lib/Analysis/TargetTransformInfo.cpp @@ -280,6 +280,15 @@ return Cost; } +int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) const { + int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + int TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace) const { @@ -296,6 +305,13 @@ return Cost; } +int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args) const { + int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + int TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) const { int Cost = TTIImpl->getCallInstrCost(F, RetTy, Tys); Index: ../lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- ../lib/Target/X86/X86TargetTransformInfo.h +++ ../lib/Target/X86/X86TargetTransformInfo.h @@ -76,7 +76,8 @@ unsigned AddressSpace); int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); - + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool Masked, unsigned Alignment); int getAddressComputationCost(Type *PtrTy, bool IsComplex); int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); @@ -94,6 +95,11 @@ bool isLegalMaskedScatter(Type 
*DataType);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
+private:
+  int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
+                      unsigned Alignment, unsigned AddressSpace);
+  int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+                      unsigned Alignment, unsigned AddressSpace);
 
   /// @}
 };
Index: ../lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- ../lib/Target/X86/X86TargetTransformInfo.cpp
+++ ../lib/Target/X86/X86TargetTransformInfo.cpp
@@ -569,6 +569,7 @@
     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i32, 3 },
 
     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
@@ -1297,6 +1298,149 @@
   return X86TTIImpl::getIntImmCost(Imm, Ty);
 }
 
+/// Return an average cost of a non-scalarized Gather / Scatter instruction.
+/// This is a rough estimate that may be improved later.
+///
+/// Opcode - Load or Store instruction.
+/// SrcVTy - The type of the data vector that should be gathered or scattered.
+/// Ptr - Pointer [or vector of pointers] - address[es] in memory.
+/// Alignment - Alignment for one element.
+/// AddressSpace - pointer[s] address space.
+int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
+                                unsigned Alignment, unsigned AddressSpace) {
+
+  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
+  unsigned VF = SrcVTy->getVectorNumElements();
+
+  // Try to reduce index size from 64 bit (default for GEP)
+  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
+  // operation 16 x 64 does not fit in zmm and should be split.
+  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
+    unsigned IndexSize = DL.getPointerSizeInBits();
+    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+    // Nothing to reduce when pointers are already narrower than 64 bits, and
+    // nothing to analyze (GEP is dereferenced below) when Ptr is not a GEP.
+    if (IndexSize < 64 || !GEP)
+      return IndexSize;
+
+    unsigned NumOfVarIndices = 0;
+    Value *Ptrs = GEP->getPointerOperand();
+    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
+      return IndexSize;
+    for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
+      if (isa<Constant>(GEP->getOperand(i)))
+        continue;
+      Type *IndxTy = GEP->getOperand(i)->getType();
+      if (IndxTy->isVectorTy())
+        IndxTy = IndxTy->getVectorElementType();
+      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
+           !isa<SExtInst>(GEP->getOperand(i))) ||
+          ++NumOfVarIndices > 1)
+        return IndexSize; // 64
+    }
+    return (unsigned)32;
+  };
+
+
+  // Trying to reduce IndexSize to 32 bits for vector 16.
+  // By default the IndexSize is equal to pointer size.
+  unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
+                                    DL.getPointerSizeInBits();
+
+  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
+                                                    IndexSize), VF);
+  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
+  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
+  if (SplitFactor > 1) {
+    // Handle splitting of vector of pointers
+    Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+    return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
+                                         AddressSpace);
+  }
+
+  // The gather / scatter cost is given by Intel architects. It is a rough
+  // number since we are looking at one instruction at a time.
+  const int GSOverhead = 2;
+  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+                                           Alignment, AddressSpace);
+
+}
+
+/// Return the cost of full scalarization of gather / scatter operation.
+///
+/// Opcode - Load or Store instruction.
+/// SrcVTy - The type of the data vector that should be gathered or scattered.
+/// Masked - The operation has non-constant mask.
+/// Alignment - Alignment for one element.
+/// AddressSpace - pointer[s] address space.
+///
+int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
+                                bool Masked, unsigned Alignment,
+                                unsigned AddressSpace) {
+  unsigned VF = SrcVTy->getVectorNumElements();
+
+  int MaskUnpackCost = 0;
+  if (Masked) {
+    // Build the i1 types in the context that owns SrcVTy, not the global one.
+    Type *Int1Ty = Type::getInt1Ty(SrcVTy->getContext());
+    VectorType *MaskTy = VectorType::get(Int1Ty, VF);
+    MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
+    int ScalarCompareCost =
+        getCmpSelInstrCost(Instruction::ICmp, Int1Ty, nullptr);
+    int BranchCost = getCFInstrCost(Instruction::Br);
+    MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
+  }
+
+  // The cost of the scalar loads/stores.
+  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+                                          Alignment, AddressSpace);
+
+  int InsertExtractCost = 0;
+  if (Opcode == Instruction::Load)
+    for (unsigned i = 0; i < VF; ++i)
+      // Add the cost of inserting each scalar load into the vector
+      InsertExtractCost +=
+        getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
+  else
+    for (unsigned i = 0; i < VF; ++i)
+      // Add the cost of extracting each element out of the data vector
+      InsertExtractCost +=
+        getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
+
+  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+}
+
+/// Calculate the cost of Gather / Scatter operation
+int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
+                                       Value *Ptr, bool VariableMask,
+                                       unsigned Alignment) {
+  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+  unsigned VF = SrcVTy->getVectorNumElements();
+  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+  if (!PtrTy && Ptr->getType()->isVectorTy())
+    PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+  assert(PtrTy && "Unexpected type for Ptr argument");
+  unsigned AddressSpace = PtrTy->getAddressSpace();
+
+  bool Scalarize = false;
+  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
+      (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
+    Scalarize = true;
+  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
+  // Vector 4 gather/scatter instruction does not exist on KNL.
+  // We can extend it to 8 elements, but zeroing upper bits of
+  // the mask vector will add more instructions. Right now we decide
+  // to scalarize vector-4 for KNL.
+  if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+    Scalarize = true;
+
+  if (Scalarize)
+    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
+
+  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
+}
+
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
   Type *ScalarTy = DataTy->getScalarType();
   int DataWidth = isa<PointerType>(ScalarTy) ?
Index: ../test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
===================================================================
--- ../test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ ../test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -1,4 +1,6 @@
-; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s -check-prefix=AVX2
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s --check-prefix=AVX2
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze < %s | FileCheck %s --check-prefix=KNL
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze < %s | FileCheck %s --check-prefix=SKX
 
 
 ; AVX2-LABEL: test1
@@ -65,6 +67,217 @@
   ret <2 x i32> %res
 }
 
+define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+
+; AVX2-LABEL: test_gather_2f64
+; AVX2: Found an estimated cost of 7 {{.*}}.gather
+
+; KNL-LABEL: test_gather_2f64
+; KNL: Found an estimated cost of 7 {{.*}}.gather
+
+; SKX-LABEL: test_gather_2f64
+; SKX: Found an estimated cost of 7 {{.*}}.gather
+
+%res = call <2 x
double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) + ret <2 x double> %res +} +declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0) + +define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { + +; AVX2-LABEL: test_gather_4i32 +; AVX2: Found an estimated cost of 16 {{.*}}.gather + +; KNL-LABEL: test_gather_4i32 +; KNL: Found an estimated cost of 16 {{.*}}.gather + +; SKX-LABEL: test_gather_4i32 +; SKX: Found an estimated cost of 6 {{.*}}.gather + +%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) + ret <4 x i32> %res +} + +define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0) { + +; AVX2-LABEL: test_gather_4i32_const_mask +; AVX2: Found an estimated cost of 8 {{.*}}.gather + +; KNL-LABEL: test_gather_4i32_const_mask +; KNL: Found an estimated cost of 8 {{.*}}.gather + +; SKX-LABEL: test_gather_4i32_const_mask +; SKX: Found an estimated cost of 6 {{.*}}.gather + +%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0) + +define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) { + +; AVX2-LABEL: test_gather_16f32_const_mask +; AVX2: Found an estimated cost of 30 {{.*}}.gather + +; KNL-LABEL: test_gather_16f32_const_mask +; KNL: Found an estimated cost of 18 {{.*}}.gather + +; SKX-LABEL: test_gather_16f32_const_mask +; SKX: Found an estimated cost of 18 {{.*}}.gather + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> 
@test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <16 x i1>%mask) { + +; AVX2-LABEL: test_gather_16f32_var_mask +; AVX2: Found an estimated cost of 62 {{.*}}.gather + +; KNL-LABEL: test_gather_16f32_var_mask +; KNL: Found an estimated cost of 18 {{.*}}.gather + +; SKX-LABEL: test_gather_16f32_var_mask +; SKX: Found an estimated cost of 18 {{.*}}.gather + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i32> %ind, <16 x i1>%mask) { + +; AVX2-LABEL: test_gather_16f32_ra_var_mask +; AVX2: Found an estimated cost of 62 {{.*}}.gather + +; KNL-LABEL: test_gather_16f32_ra_var_mask +; KNL: Found an estimated cost of 20 {{.*}}.gather + +; SKX-LABEL: test_gather_16f32_ra_var_mask +; SKX: Found an estimated cost of 20 {{.*}}.gather + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) { + +; AVX2-LABEL: test_gather_16f32_const_mask2 +; AVX2: Found an estimated cost of 30 {{.*}}.gather + +; KNL-LABEL: test_gather_16f32_const_mask2 +; KNL: Found an estimated cost of 18 {{.*}}.gather + +; SKX-LABEL: test_gather_16f32_const_mask2 +; SKX: Found an estimated cost of 18 {{.*}}.gather + + %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 + %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x float*> 
%broadcast.splat, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { +; AVX2-LABEL: test_scatter_16i32 +; AVX2: Found an estimated cost of 64 {{.*}}.scatter + +; KNL-LABEL: test_scatter_16i32 +; KNL: Found an estimated cost of 18 {{.*}}.scatter + +; SKX-LABEL: test_scatter_16i32 +; SKX: Found an estimated cost of 18 {{.*}}.scatter + + %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 + %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer + + %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind + %imask = bitcast i16 %mask to <16 x i1> + call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) + ret void +} + +define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) { +; AVX2-LABEL: test_scatter_8i32 +; AVX2: Found an estimated cost of 32 {{.*}}.scatter + +; KNL-LABEL: test_scatter_8i32 +; KNL: Found an estimated cost of 10 {{.*}}.scatter + +; SKX-LABEL: test_scatter_8i32 +; SKX: Found an estimated cost of 10 {{.*}}.scatter + + call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) + ret void +} + +declare void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask) + +define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { +; AVX2-LABEL: test_scatter_4i32 +; AVX2: Found an estimated cost of 16 {{.*}}.scatter + +; KNL-LABEL: test_scatter_4i32 +; KNL: Found an estimated cost of 16 {{.*}}.scatter + +; SKX-LABEL: test_scatter_4i32 +; SKX: Found an estimated cost of 6 {{.*}}.scatter + + call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) + ret void +} + +define <4 
x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask) { + +; AVX2-LABEL: test_gather_4f32 +; AVX2: Found an estimated cost of 15 {{.*}}.gather + +; KNL-LABEL: test_gather_4f32 +; KNL: Found an estimated cost of 15 {{.*}}.gather + +; SKX-LABEL: test_gather_4f32 +; SKX: Found an estimated cost of 6 {{.*}}.gather + + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) + ret <4 x float>%res +} + +define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) { + +; AVX2-LABEL: test_gather_4f32_const_mask +; AVX2: Found an estimated cost of 7 {{.*}}.gather + +; KNL-LABEL: test_gather_4f32_const_mask +; KNL: Found an estimated cost of 7 {{.*}}.gather + +; SKX-LABEL: test_gather_4f32_const_mask +; SKX: Found an estimated cost of 6 {{.*}}.gather + + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) + ret <4 x float>%res +} + +declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> ) +declare void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask) +declare void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask) +declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>) declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)