Index: ../include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- ../include/llvm/Analysis/TargetTransformInfo.h +++ ../include/llvm/Analysis/TargetTransformInfo.h @@ -458,6 +458,15 @@ int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of Gather or Scatter operation + /// \p Opcode - is a type of memory access Load or Store + /// \p DataTy - a vector type of the data to be loaded or stored + /// \Ptr - poiner [or vector of pointers] - address[es] in memory + /// \p Masked - true when the memory access is predicated + /// \p Alignment - alignment of single element + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool Masked, unsigned Alignment) const; + /// \return The cost of the interleaved memory operation. /// \p Opcode is the memory operation code /// \p VecTy is the vector type of the interleaved access. @@ -614,6 +623,9 @@ virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool Masked, + unsigned Alignment) = 0; virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, @@ -791,6 +803,11 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool Masked, + unsigned Alignment) override { + return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, Masked, Alignment); + } int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace) override { Index: ../include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- ../include/llvm/Analysis/TargetTransformInfoImpl.h +++ ../include/llvm/Analysis/TargetTransformInfoImpl.h @@ -304,6 +304,12 @@ return 1; } + unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, bool Masked, + unsigned Alignment, + unsigned AddressSpace) { + return 1; + } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Index: ../include/llvm/IR/IRBuilder.h =================================================================== --- ../include/llvm/IR/IRBuilder.h +++ ../include/llvm/IR/IRBuilder.h @@ -432,6 +432,14 @@ CallInst *CreateMaskedStore(Value *Val, Value *Ptr, unsigned Align, Value *Mask); + /// \brief Create a call to Masked Gather intrinsic + CallInst *CreateMaskedGather(Value *Ptrs, unsigned Align, Value *Mask, + Value *PassThru = 0, const Twine& Name = ""); + + /// \brief Create a call to Masked Scatter intrinsic + CallInst *CreateMaskedScatter(Value *Val, Value *Ptrs, unsigned Align, + Value *Mask = 0); + /// \brief Create an assume intrinsic call that allows the optimizer to /// assume that the provided condition will be true. CallInst *CreateAssumption(Value *Cond); Index: ../lib/Analysis/TargetTransformInfo.cpp =================================================================== --- ../lib/Analysis/TargetTransformInfo.cpp +++ ../lib/Analysis/TargetTransformInfo.cpp @@ -280,6 +280,15 @@ return Cost; } +int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool Masked, + unsigned Alignment) const { + int Cost = + TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, Masked, Alignment); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + int TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace) const { Index: ../lib/IR/IRBuilder.cpp =================================================================== --- ../lib/IR/IRBuilder.cpp +++ ../lib/IR/IRBuilder.cpp @@ -247,6 +247,64 @@ return createCallHelper(TheFn, Ops, this, Name); } +/// Create a call to a Masked Load intrinsic. +/// Ptrs - the vector of pointers for loading +/// Align - alignment for one element +/// Mask - an vector of booleans which indicates what vector lanes should +/// be accessed in memory +/// PassThru - a pass-through value that is used to fill the masked-off lanes +/// of the result +/// Name - name of the result variable +CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, unsigned Align, + Value *Mask, Value *PassThru, + const Twine& Name) { + auto *PtrsTy = dyn_cast(Ptrs->getType()); + assert(PtrsTy && PtrsTy->getElementType()->isPointerTy() && + "Expected vector of pointers for gather"); + + PointerType *PtrTy = cast(PtrsTy->getElementType()); + unsigned NumElts = PtrsTy->getVectorNumElements(); + Type *DataTy = VectorType::get(PtrTy->getElementType(), NumElts); + + if (!Mask) + Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context), + NumElts)); + + Value * Ops[] = {Ptrs, getInt32(Align), Mask, UndefValue::get(DataTy)}; + return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, DataTy, Name); +} + +/// Create a call to a Masked Scatter intrinsic. +/// Val - the data to be stored, +/// Ptrs - the vector of pointers, where the Val elements should be stored +/// Align - alignment for one element +/// Mask - an vector of booleans which indicates what vector lanes should +/// be accessed in memory +CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs, + unsigned Align, Value *Mask) { + auto *PtrsTy = dyn_cast(Ptrs->getType()); + auto *DataTy = dyn_cast(Data->getType()); + + assert(PtrsTy && PtrsTy->getElementType()->isPointerTy() && + "Expected vector of pointers for scatter"); + + assert(DataTy && "Unexpected data type for scatter"); + + PointerType *PtrTy = cast(PtrsTy->getElementType()); + unsigned NumElts = PtrsTy->getVectorNumElements(); + + assert(NumElts == DataTy->getVectorNumElements() && + PtrTy->getElementType() == DataTy->getElementType() && + "Incopatible pointer and data types"); + + if (!Mask) + Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context), + NumElts)); + Value * Ops[] = {Data, Ptrs, getInt32(Align), Mask}; + // Type of the data to be stored - the only one overloaded type + return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, DataTy); +} + template static std::vector getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes, Index: ../lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- ../lib/Target/X86/X86TargetTransformInfo.h +++ ../lib/Target/X86/X86TargetTransformInfo.h @@ -76,7 +76,8 @@ unsigned AddressSpace); int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); - + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool Masked, unsigned Alignment); int getAddressComputationCost(Type *PtrTy, bool IsComplex); int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); Index: ../lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- ../lib/Target/X86/X86TargetTransformInfo.cpp +++ ../lib/Target/X86/X86TargetTransformInfo.cpp @@ -1297,6 +1297,11 @@ return X86TTIImpl::getIntImmCost(Imm, Ty); } +int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, + Value *Ptr, bool Masked, + unsigned Alignment) { + return 1; +} bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { Type *ScalarTy = DataTy->getScalarType(); int DataWidth = isa(ScalarTy) ? Index: ../lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- ../lib/Transforms/Vectorize/LoopVectorize.cpp +++ ../lib/Transforms/Vectorize/LoopVectorize.cpp @@ -422,6 +422,14 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + /// When we go over instructions in the basic block we rely on previous + /// values within the current basic block or on loop invariant values. + /// When we widen (vectorize) values we place them in the map. If the values + /// are not within the map, they have to be loop invariant. This function + /// does not broadcast them into a vector. The value remains scalar. + /// The function is used to vectorize GEP. + VectorParts getVectorOrScalarValue(Value *V); + /// Try to vectorize the interleaved access group that \p Instr belongs to. void vectorizeInterleaveGroup(Instruction *Instr); @@ -1282,6 +1290,17 @@ bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); } + /// Returns true if the target machine supports masked scatter operation + /// for the given \p DataType. + bool isLegalMaskedScatter(Type *DataType) { + return TTI->isLegalMaskedScatter(DataType); + } + /// Returns true if the target machine supports masked gather operation + /// for the given \p DataType. + bool isLegalMaskedGather(Type *DataType) { + return TTI->isLegalMaskedGather(DataType); + } + /// Returns true if vector representation of the instruction \p I /// requires mask. bool isMaskRequired(const Instruction* I) { @@ -2092,6 +2111,25 @@ return WidenMap.splat(V, B); } +// Return a vector value if it already exists. If not, do not broadcast +// the scalar, just splat it. +InnerLoopVectorizer::VectorParts +InnerLoopVectorizer::getVectorOrScalarValue(Value *V) { + assert(V != Induction && "The new induction variable should not be used."); + assert(!V->getType()->isVectorTy() && "Can't widen a vector"); + + // If we have a stride that is replaced by one, do it here. + if (Legal->hasStride(V)) + V = ConstantInt::get(V->getType(), 1); + + // If we have this scalar in the map, return it. + if (WidenMap.has(V)) + return WidenMap.get(V); + + VectorParts Scalars(UF, V); + return Scalars; +} + Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); SmallVector ShuffleMask; @@ -2379,70 +2417,103 @@ if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); - // If the pointer is loop invariant or if it is non-consecutive, - // scalarize the load. + // If the pointer is loop invariant scalarize the load. + if (LI && Legal->isUniform(Ptr)) + return scalarizeInstruction(Instr); + + // If the pointer is non-consecutive and gather/scatter is not suppored + // scalarize the instruction. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; - bool UniformLoad = LI && Legal->isUniform(Ptr); - if (!ConsecutiveStride || UniformLoad) + bool CreateGatherScatter = false; + if (!ConsecutiveStride) + CreateGatherScatter = + (LI && Legal->isLegalMaskedGather(ScalarDataTy)) || + (SI && Legal->isLegalMaskedScatter(ScalarDataTy)); + + if (!ConsecutiveStride && !CreateGatherScatter) return scalarizeInstruction(Instr); Constant *Zero = Builder.getInt32(0); VectorParts &Entry = WidenMap.get(Instr); + VectorParts VectorGep; // Handle consecutive loads/stores. GetElementPtrInst *Gep = getGEPInstruction(Ptr); - if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { - setDebugLocFromInst(Builder, Gep); - Value *PtrOperand = Gep->getPointerOperand(); - Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; - FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast(Gep->clone()); - Gep2->setOperand(0, FirstBasePtr); - Gep2->setName("gep.indvar.base"); - Ptr = Builder.Insert(Gep2); - } else if (Gep) { - setDebugLocFromInst(Builder, Gep); - assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), - OrigLoop) && - "Base ptr must be invariant"); - - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - unsigned InductionOperand = getGEPInductionOperand(Gep); - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast(Gep->clone()); - - for (unsigned i = 0; i < NumOperands; ++i) { - Value *GepOperand = Gep->getOperand(i); - Instruction *GepOperandInst = dyn_cast(GepOperand); - - // Update last index or loop invariant instruction anchored in loop. - if (i == InductionOperand || - (GepOperandInst && OrigLoop->contains(GepOperandInst))) { - assert((i == InductionOperand || - PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), - OrigLoop)) && - "Must be last index or loop invariant"); - - VectorParts &GEPParts = getVectorValue(GepOperand); - Value *Index = GEPParts[0]; - Index = Builder.CreateExtractElement(Index, Zero); - Gep2->setOperand(i, Index); - Gep2->setName("gep.indvar.idx"); + if (ConsecutiveStride) { + if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { + setDebugLocFromInst(Builder, Gep); + Value *PtrOperand = Gep->getPointerOperand(); + Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; + FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast(Gep->clone()); + Gep2->setOperand(0, FirstBasePtr); + Gep2->setName("gep.indvar.base"); + Ptr = Builder.Insert(Gep2); + } else if (Gep) { + setDebugLocFromInst(Builder, Gep); + assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), + OrigLoop) && + "Base ptr must be invariant"); + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + unsigned InductionOperand = getGEPInductionOperand(Gep); + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast(Gep->clone()); + + for (unsigned i = 0; i < NumOperands; ++i) { + Value *GepOperand = Gep->getOperand(i); + Instruction *GepOperandInst = dyn_cast(GepOperand); + + // Update last index or loop invariant instruction anchored in loop. + if (i == InductionOperand || + (GepOperandInst && OrigLoop->contains(GepOperandInst))) { + assert((i == InductionOperand || + PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), + OrigLoop)) && + "Must be last index or loop invariant"); + + VectorParts &GEPParts = getVectorValue(GepOperand); + Value *Index = GEPParts[0]; + Index = Builder.CreateExtractElement(Index, Zero); + Gep2->setOperand(i, Index); + Gep2->setName("gep.indvar.idx"); + } + } + Ptr = Builder.Insert(Gep2); + } else { // No GEP + // Use the induction element ptr. + assert(isa(Ptr) && "Invalid induction ptr"); + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrVal = getVectorValue(Ptr); + Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); + } + } else { + // Supported random access (gather / scatter) + assert(CreateGatherScatter && "The instruction should be scalarized"); + if (Gep) { + SmallVector OpsV; + for (unsigned i = 0; i < Gep->getNumOperands(); i++) + OpsV.push_back(getVectorOrScalarValue(Gep->getOperand(i))); + + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector Ops; + for (unsigned i = 1; i < Gep->getNumOperands(); i++) + Ops.push_back(OpsV[i][Part]); + Value *GEPBasePtr = OpsV[0][Part]; + Value *NewGep = Builder.CreateGEP(nullptr, GEPBasePtr, Ops, + "VectorGep"); + NewGep = Builder.CreateBitCast(NewGep, + VectorType::get(Ptr->getType(), VF)); + VectorGep.push_back(NewGep); + } + } else { + VectorGep = getVectorValue(Ptr); } } - Ptr = Builder.Insert(Gep2); - } else { - // Use the induction element ptr. - assert(isa(Ptr) && "Invalid induction ptr"); - setDebugLocFromInst(Builder, Ptr); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); - } VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: @@ -2455,30 +2526,38 @@ VectorParts StoredVal = getVectorValue(SI->getValueOperand()); for (unsigned Part = 0; Part < UF; ++Part) { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = + Instruction *NewSI; + if (CreateGatherScatter) { + Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr; + NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part], + Alignment, MaskPart); + } + else { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); - if (Reverse) { - // If we store to reverse consecutive memory locations, then we need - // to reverse the order of elements in the stored value. - StoredVal[Part] = reverseVector(StoredVal[Part]); - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); - Mask[Part] = reverseVector(Mask[Part]); - } + if (Reverse) { + // If we store to reverse consecutive memory locations, then we need + // to reverse the order of elements in the stored value. + StoredVal[Part] = reverseVector(StoredVal[Part]); + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); + Mask[Part] = reverseVector(Mask[Part]); + } - Value *VecPtr = Builder.CreateBitCast(PartPtr, - DataTy->getPointerTo(AddressSpace)); + Value *VecPtr = Builder.CreateBitCast(PartPtr, + DataTy->getPointerTo(AddressSpace)); - Instruction *NewSI; - if (Legal->isMaskRequired(SI)) - NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, - Mask[Part]); - else - NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); + if (Legal->isMaskRequired(SI)) + NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, + Mask[Part]); + else + NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, + Alignment); + } propagateMetadata(NewSI, SI); } return; @@ -2488,29 +2567,37 @@ assert(LI && "Must have a load instruction"); setDebugLocFromInst(Builder, LI); for (unsigned Part = 0; Part < UF; ++Part) { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = - Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); - - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide load needs to start at the last vector element. - PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); - Mask[Part] = reverseVector(Mask[Part]); + Instruction* NewLI; + if (CreateGatherScatter) { + Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr; + NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, + MaskPart, 0, + "wide.masked.gather"); + Entry[Part] = NewLI; } + else { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); - Instruction* NewLI; - Value *VecPtr = Builder.CreateBitCast(PartPtr, - DataTy->getPointerTo(AddressSpace)); - if (Legal->isMaskRequired(LI)) - NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], - UndefValue::get(DataTy), - "wide.masked.load"); - else - NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide load needs to start at the last vector element. + PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); + Mask[Part] = reverseVector(Mask[Part]); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, + DataTy->getPointerTo(AddressSpace)); + if (Legal->isMaskRequired(LI)) + NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], + UndefValue::get(DataTy), + "wide.masked.load"); + else + NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); + Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; + } propagateMetadata(NewLI, LI); - Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; } } @@ -4501,7 +4588,8 @@ if (!LI) return false; if (!SafePtrs.count(LI->getPointerOperand())) { - if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) { + if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) || + isLegalMaskedGather(LI->getType())) { MaskedOp.insert(LI); continue; } @@ -4526,7 +4614,8 @@ // scalarize the block. bool isLegalMaskedOp = isLegalMaskedStore(SI->getValueOperand()->getType(), - SI->getPointerOperand()); + SI->getPointerOperand()) || + isLegalMaskedScatter(SI->getValueOperand()->getType()); if (isLegalMaskedOp) { --NumPredStores; MaskedOp.insert(SI); @@ -5262,6 +5351,17 @@ return Cost; } +/// \brief Check if the load/store instruction may be transfered to +/// gather/scatter during vectorization. +/// +/// It is possible when gather/scatter is supported by target. +static bool isGatherOrScatterLegal(Instruction *I, Value *Ptr, + LoopVectorizationLegality *Legal) { + Type *DataTy = cast(Ptr->getType())->getElementType(); + return (isa(I) && Legal->isLegalMaskedGather(DataTy)) || + (isa(I) && Legal->isLegalMaskedScatter(DataTy)); +} + /// \brief Check whether the address computation for a non-consecutive memory /// access looks like an unlikely candidate for being merged into the indexing /// mode. @@ -5481,11 +5581,15 @@ // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool UseGatherOrScatter = (ConsecutiveStride == 0) && + isGatherOrScatterLegal(I, Ptr, Legal); + bool Reverse = ConsecutiveStride < 0; const DataLayout &DL = I->getModule()->getDataLayout(); unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy); unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF; - if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { + if ((!ConsecutiveStride && !UseGatherOrScatter) || + ScalarAllocatedSize != VectorElementSize) { bool IsComplexComputation = isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); unsigned Cost = 0; @@ -5509,8 +5613,13 @@ return Cost; } - // Wide load/stores. unsigned Cost = TTI.getAddressComputationCost(VectorTy); + if (UseGatherOrScatter) + return Cost + + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(I), Alignment); + + // Wide load/stores. if (Legal->isMaskRequired(I)) Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); Index: ../test/Transforms/LoopVectorize/X86/gather_scatter.ll =================================================================== --- ../test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ ../test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -0,0 +1,236 @@ +; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512 + +;AVX1-NOT: llvm.masked + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc_linux" + +; The source code: +; +;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) { +; +; for (int i=0; i < SIZE; ++i) { +; if (trigger[i] > 0) { +; out[i] = in[index[i]] + (float) 0.5; +; } +; } +;} + +;AVX512-LABEL: @foo1 +;AVX512: llvm.masked.load +;AVX512: llvm.masked.gather +;AVX512: llvm.masked.store +;AVX512: ret void + +; Function Attrs: nounwind uwtable +define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) { +entry: + %in.addr = alloca float*, align 8 + %out.addr = alloca float*, align 8 + %trigger.addr = alloca i32*, align 8 + %index.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store float* %in, float** %in.addr, align 8 + store float* %out, float** %out.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32* %index, i32** %index.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 4096 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %3, 0 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load i32*, i32** %index.addr, align 8 + %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2 + %6 = load i32, i32* %arrayidx3, align 4 + %idxprom4 = sext i32 %6 to i64 + %7 = load float*, float** %in.addr, align 8 + %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4 + %8 = load float, float* %arrayidx5, align 4 + %add = fadd float %8, 5.000000e-01 + %9 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %9 to i64 + %10 = load float*, float** %out.addr, align 8 + %arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6 + store float %add, float* %arrayidx7, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %11 = load i32, i32* %i, align 4 + %inc = add nsw i32 %11, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; The source code +;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) { +; +; for (int i=0; i 0) { +; out[i] = in[i].b + (float) 0.5; +; } +; } +;} + +%struct.In = type { float, float } + +;AVX512-LABEL: @foo2 +;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1 +;AVX512: llvm.masked.gather.v16f32 +;AVX512: llvm.masked.store +;AVX512: ret void +define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { +entry: + %in.addr = alloca %struct.In*, align 8 + %out.addr = alloca float*, align 8 + %trigger.addr = alloca i32*, align 8 + %index.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store %struct.In* %in, %struct.In** %in.addr, align 8 + store float* %out, float** %out.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32* %index, i32** %index.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 4096 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %3, 0 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load %struct.In*, %struct.In** %in.addr, align 8 + %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2 + %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1 + %6 = load float, float* %b, align 4 + %add = fadd float %6, 5.000000e-01 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load float*, float** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4 + store float %add, float* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; The source code +;struct Out { +; float a; +; float b; +;}; +;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) { +; +; for (int i=0; i 0) { +; out[i].b = in[i].b + (float) 0.5; +; } +; } +;} + +;AVX512-LABEL: @foo3 +;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1 +;AVX512: llvm.masked.gather.v16f32 +;AVX512: fadd <16 x float> +;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %induction, i32 1 +;AVX512: llvm.masked.scatter.v16f32 +;AVX512: ret void + +%struct.Out = type { float, float } + +define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) { +entry: + %in.addr = alloca %struct.In*, align 8 + %out.addr = alloca %struct.Out*, align 8 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store %struct.In* %in, %struct.In** %in.addr, align 8 + store %struct.Out* %out, %struct.Out** %out.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 4096 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %3, 0 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load %struct.In*, %struct.In** %in.addr, align 8 + %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2 + %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1 + %6 = load float, float* %b, align 4 + %add = fadd float %6, 5.000000e-01 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load %struct.Out*, %struct.Out** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4 + %b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1 + store float %add, float* %b6, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} +declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>) Index: ../test/Transforms/LoopVectorize/X86/masked_load_store.ll =================================================================== --- ../test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ ../test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -280,7 +280,7 @@ ;AVX2: ret void ;AVX512-LABEL: @foo4 -;AVX512-NOT: llvm.masked +;AVX512-NOT: llvm.masked.load ;AVX512: ret void ; Function Attrs: nounwind uwtable