Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -743,6 +743,8 @@
   bool isLegalMaskedStore(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked load.
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) const;
+  /// Return true if the target supports masked prefetch.
+  bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) const;
   /// Return true if the target supports nontemporal store.
   bool isLegalNTStore(Type *DataType, Align Alignment) const;
@@ -757,6 +759,8 @@
   bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked gather.
   bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
+  /// Return true if the target supports masked gather prefetch.
+  bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) const;
   /// Return true if the target forces scalarizing of llvm.masked.gather
   /// intrinsics.
   bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const;
@@ -1769,12 +1773,14 @@
       getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const = 0;
   virtual bool isLegalMaskedStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0;
+  virtual bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalBroadcastLoad(Type *ElementTy,
                                     ElementCount NumElements) const = 0;
   virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
+  virtual bool isLegalMaskedGatherPrefetch(Type *DataType,
+                                           Align Alignment) = 0;
   virtual bool forceScalarizeMaskedGather(VectorType *DataType,
                                           Align Alignment) = 0;
   virtual bool forceScalarizeMaskedScatter(VectorType *DataType,
@@ -2225,6 +2231,9 @@
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedLoad(DataType, Alignment);
   }
+  bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) override {
+    return Impl.isLegalMaskedPrefetch(DataType, Alignment);
+  }
   bool isLegalNTStore(Type *DataType, Align Alignment) override {
     return Impl.isLegalNTStore(DataType, Alignment);
   }
@@ -2241,6 +2250,9 @@
   bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedGather(DataType, Alignment);
   }
+  bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) override {
+    return Impl.isLegalMaskedGatherPrefetch(DataType, Alignment);
+  }
   bool forceScalarizeMaskedGather(VectorType *DataType,
                                   Align Alignment) override {
     return Impl.forceScalarizeMaskedGather(DataType, Alignment);
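Note (reviewer illustration, not part of the patch): a transform that wants to emit the new intrinsics
would query these hooks through the usual TargetTransformInfo interface. A minimal sketch, with the
vector type and alignment chosen arbitrarily:

    // Sketch only: both hooks default to false (see TargetTransformInfoImpl.h
    // below), so this returns true only for targets that opt in.
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    static bool canUseMaskedPrefetch(const TargetTransformInfo &TTI,
                                     LLVMContext &Ctx) {
      auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
      return TTI.isLegalMaskedPrefetch(VecTy, Align(4)) ||
             TTI.isLegalMaskedGatherPrefetch(VecTy, Align(4));
    }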
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -254,6 +254,10 @@
     return false;
   }
 
+  bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
   bool isLegalNTStore(Type *DataType, Align Alignment) const {
     // By default, assume nontemporal memory stores are available for stores
     // that are aligned and have a size that is a power of 2.
@@ -280,6 +284,10 @@
     return false;
   }
 
+  bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
   bool forceScalarizeMaskedGather(VectorType *DataType, Align Alignment) const {
     return false;
   }
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1556,6 +1556,15 @@
     return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
                                            VarMask, Alignment, CostKind, I);
   }
+  case Intrinsic::masked_gather_prefetch: {
+    const Value *Mask = Args[4];
+    bool VarMask = !isa<Constant>(Mask);
+    Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
+    auto *MaskVT = cast<VectorType>(Mask->getType());
+    auto *PseudoDataTy = MaskVT->getWithNewBitWidth(Alignment.value() * 8);
+    return thisT()->getGatherScatterOpCost(Instruction::Call, PseudoDataTy,
+                                           Args[0], VarMask, Alignment,
+                                           CostKind, I);
+  }
   case Intrinsic::experimental_stepvector: {
     if (isa<ScalableVectorType>(RetTy))
       return BaseT::getIntrinsicInstrCost(ICA, CostKind);
@@ -1870,6 +1879,13 @@
     return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
                                           CostKind);
   }
+  case Intrinsic::masked_prefetch: {
+    auto *MaskVT = cast<VectorType>(ICA.getArgTypes()[4]);
+    Type *PseudoTy = MaskVT->getWithNewBitWidth(32);
+    Align TyAlign = thisT()->DL.getABITypeAlign(PseudoTy);
+    return thisT()->getMaskedMemoryOpCost(Instruction::Call, PseudoTy, TyAlign,
+                                          0, CostKind);
+  }
   case Intrinsic::vector_reduce_add:
     return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
                                                std::nullopt, CostKind);
Index: llvm/include/llvm/IR/IRBuilder.h
===================================================================
--- llvm/include/llvm/IR/IRBuilder.h
+++ llvm/include/llvm/IR/IRBuilder.h
@@ -792,6 +792,11 @@
   CallInst *CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment,
                               Value *Mask);
 
+  /// Create a call to Masked Prefetch intrinsic
+  CallInst *CreateMaskedPrefetch(Type *Ty, Value *Ptr, Align Alignment,
+                                 Value *Mask, Value *RW = nullptr,
+                                 Value *Locality = nullptr,
+                                 const Twine &Name = "");
+
   /// Create a call to Masked Gather intrinsic
   CallInst *CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment,
                                Value *Mask = nullptr, Value *PassThru = nullptr,
@@ -801,6 +806,12 @@
   CallInst *CreateMaskedScatter(Value *Val, Value *Ptrs, Align Alignment,
                                 Value *Mask = nullptr);
 
+  /// Create a call to Masked Gather Prefetch intrinsic
+  CallInst *CreateMaskedGatherPrefetch(Type *Ty, Value *Ptrs, Align Alignment,
+                                       Value *Mask = nullptr,
+                                       Value *RW = nullptr,
+                                       Value *Locality = nullptr,
+                                       const Twine &Name = "");
+
   /// Create a call to Masked Expand Load intrinsic
   CallInst *CreateMaskedExpandLoad(Type *Ty, Value *Ptr, Value *Mask = nullptr,
                                    Value *PassThru = nullptr,
Index: llvm/include/llvm/IR/IntrinsicInst.h
===================================================================
--- llvm/include/llvm/IR/IntrinsicInst.h
+++ llvm/include/llvm/IR/IntrinsicInst.h
@@ -1328,6 +1328,106 @@
   }
 };
 
+/// This class represents a prefetch intrinsic,
+/// i.e. llvm.prefetch
+class PrefetchInst : public IntrinsicInst {
+public:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::prefetch;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+
+  Value *getPointerOperand() { return getOperand(0); }
+  const Value *getPointerOperand() const { return getOperand(0); }
+  static unsigned getPointerOperandIndex() { return 0U; }
+  Type *getPointerOperandType() const {
+    return getPointerOperand()->getType();
+  }
+};
+
+/// A helper function that returns the pointer operand of a prefetch
+/// intrinsic. Returns nullptr if \p V is not a prefetch.
+inline const Value *getPrefetchPointerOperand(const Value *V) {
+  if (auto *Prefetch = dyn_cast<PrefetchInst>(V))
+    return Prefetch->getPointerOperand();
+  return nullptr;
+}
+inline Value *getPrefetchPointerOperand(Value *V) {
+  return const_cast<Value *>(
+      getPrefetchPointerOperand(static_cast<const Value *>(V)));
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// a prefetch intrinsic.
+inline unsigned getPrefetchAddressSpace(Value *I) {
+  assert(isa<PrefetchInst>(I) && "Expected prefetch instruction");
+  auto *PtrTy = cast<PrefetchInst>(I)->getPointerOperandType();
+  return cast<PointerType>(PtrTy)->getAddressSpace();
+}
+
+/// A helper function that returns the prefetched element type of a prefetch
+/// intrinsic, or nullptr if it cannot be determined.
+inline Type *getPrefetchType(Value *I) {
+  assert(isa<PrefetchInst>(I) && "Expected prefetch instruction");
+  auto *Prefetch = cast<PrefetchInst>(I);
+  if (auto *GEP =
+          dyn_cast<GetElementPtrInst>(Prefetch->getPointerOperand())) {
+    Type *ElemTy = GEP->getSourceElementType();
+    auto *ArrTy = dyn_cast<ArrayType>(ElemTy);
+    while (ArrTy) {
+      ElemTy = ArrTy->getArrayElementType();
+      ArrTy = dyn_cast<ArrayType>(ElemTy);
+    }
+    return isa<PointerType>(ElemTy) ? Type::getInt64Ty(I->getContext())
+                                    : ElemTy;
+  }
+  if (auto *Alloca = dyn_cast<AllocaInst>(Prefetch->getPointerOperand())) {
+    Type *ElemTy = Alloca->getAllocatedType()->getArrayElementType();
+    return isa<PointerType>(ElemTy) ? Type::getInt64Ty(I->getContext())
+                                    : ElemTy;
+  }
+  return nullptr;
+}
+
+/// A helper function that returns the alignment of a prefetch intrinsic.
+inline Align getPrefetchAlignment(Value *I) {
+  assert(isa<PrefetchInst>(I) && "Expected prefetch instruction");
+  auto *Ty = getPrefetchType(I);
+  return Ty ? Align(Ty->getScalarSizeInBits() >> 3) : Align(1ULL);
+}
+
+/// A helper function that returns the alignment of a load/store/prefetch
+/// instruction.
+inline Align getLdStPfAlignment(Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchAlignment(I);
+  return getLoadStoreAlignment(I);
+}
+
+/// A helper function that returns the pointer operand of a load/store/prefetch
+/// instruction. Returns nullptr if \p I is not one of these.
+inline const Value *getLdStPfPointerOperand(const Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchPointerOperand(I);
+  return getLoadStorePointerOperand(I);
+}
+inline Value *getLdStPfPointerOperand(Value *V) {
+  return const_cast<Value *>(
+      getLdStPfPointerOperand(static_cast<const Value *>(V)));
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// a load/store/prefetch instruction.
+inline unsigned getLdStPfAddressSpace(Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchAddressSpace(I);
+  return getLoadStoreAddressSpace(I);
+}
+
+/// A helper function that returns the accessed type of a load/store/prefetch
+/// instruction.
+inline Type *getLdStPfType(Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchType(I);
+  return getLoadStoreType(I);
+}
+
 /// This class represents any memmove intrinsic
 /// i.e. llvm.element.unordered.atomic.memmove
 ///  and llvm.memmove
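Note (reviewer illustration, not part of the patch): the getLdStPf* helpers are meant to let client
code treat llvm.prefetch uniformly with loads and stores, which is how LoopVectorize.cpp uses them
below. A short sketch; the function name is hypothetical:

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/IntrinsicInst.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Sketch only: classify any load, store or prefetch the same way.
    static void describeMemAccess(Instruction &I) {
      Value *Ptr = getLdStPfPointerOperand(&I);
      if (!Ptr) // not a load, store or llvm.prefetch call
        return;
      Type *AccessTy = getLdStPfType(&I); // may be null for a prefetch whose
                                          // pointee type cannot be guessed
      Align A = getLdStPfAlignment(&I);   // falls back to Align(1) in that case
      errs() << I << "  addrspace(" << getLdStPfAddressSpace(&I) << ") align "
             << A.value() << (AccessTy ? "" : " (unknown type)") << "\n";
    }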
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -2208,6 +2208,21 @@
               [IntrWriteMem, IntrArgMemOnly, IntrWillReturn,
                NoCapture<ArgIndex<1>>]>;
 
+def int_masked_prefetch:
+    DefaultAttrsIntrinsic<[],
+              [llvm_anyptr_ty,
+               llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty],
+              [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn,
+               ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+
+def int_masked_gather_prefetch:
+    DefaultAttrsIntrinsic<[],
+              [llvm_anyvector_ty,
+               llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+              [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn,
+               ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+
 // Test whether a pointer is associated with a type metadata identifier.
 def int_type_test : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty],
                                           [IntrNoMem, IntrWillReturn, IntrSpeculatable]>;
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -433,6 +433,11 @@
   return TTIImpl->isLegalMaskedLoad(DataType, Alignment);
 }
 
+bool TargetTransformInfo::isLegalMaskedPrefetch(Type *DataType,
+                                                Align Alignment) const {
+  return TTIImpl->isLegalMaskedPrefetch(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalNTStore(Type *DataType,
                                          Align Alignment) const {
   return TTIImpl->isLegalNTStore(DataType, Alignment);
@@ -481,6 +486,11 @@
   return TTIImpl->isLegalMaskedExpandLoad(DataType);
 }
 
+bool TargetTransformInfo::isLegalMaskedGatherPrefetch(Type *DataType,
+                                                      Align Alignment) const {
+  return TTIImpl->isLegalMaskedGatherPrefetch(DataType, Alignment);
+}
+
 bool TargetTransformInfo::enableOrderedReductions() const {
   return TTIImpl->enableOrderedReductions();
 }
Index: llvm/lib/Analysis/VectorUtils.cpp
===================================================================
--- llvm/lib/Analysis/VectorUtils.cpp
+++ llvm/lib/Analysis/VectorUtils.cpp
@@ -92,6 +92,7 @@
   case Intrinsic::canonicalize:
   case Intrinsic::fptosi_sat:
   case Intrinsic::fptoui_sat:
+  case Intrinsic::prefetch:
     return true;
   default:
     return false;
Index: llvm/lib/IR/IRBuilder.cpp
===================================================================
--- llvm/lib/IR/IRBuilder.cpp
+++ llvm/lib/IR/IRBuilder.cpp
@@ -602,6 +602,30 @@
   return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, OverloadedTypes);
 }
 
+/// Create a call to a Masked Prefetch intrinsic.
+/// \p Ty        - vector type to prefetch
+/// \p Ptr       - base pointer for the prefetch
+/// \p Alignment - alignment of the prefetched location
+/// \p Mask      - vector of booleans which indicates what vector lanes should
+///                be accessed in memory
+/// \p RW        - Read or Write
+/// \p Locality  - Cache Level
+/// \p Name      - name of the result variable
+CallInst *IRBuilderBase::CreateMaskedPrefetch(Type *Ty, Value *Ptr,
+                                              Align Alignment, Value *Mask,
+                                              Value *RW, Value *Locality,
+                                              const Twine &Name) {
+  auto *PtrTy = cast<PointerType>(Ptr->getType());
+  assert(Ty->isVectorTy() && "Type should be vector");
+  assert(PtrTy->isOpaqueOrPointeeTypeMatches(Ty) && "Wrong element type");
+  assert(Mask && "Mask should not be all-ones (null)");
+  Type *OverloadedTypes[] = {PtrTy, Mask->getType()};
+  Value *Ops[] = {Ptr, getInt32(Alignment.value()), RW, Locality, Mask};
+  return CreateMaskedIntrinsic(Intrinsic::masked_prefetch, Ops,
+                               OverloadedTypes, Name);
+}
+
 /// Create a call to a Masked intrinsic, with given intrinsic Id,
 /// an array of operands - Ops, and an array of overloaded types -
 /// OverloadedTypes.
@@ -708,6 +732,41 @@
                                OverloadedTypes);
 }
 
+/// Create a call to a Masked Gather Prefetch intrinsic.
+/// \p Ty        - vector type to gather
+/// \p Ptrs      - vector of pointers for prefetching
+/// \p Alignment - alignment of the prefetched locations
+/// \p Mask      - vector of booleans which indicates what vector lanes should
+///                be accessed in memory
+/// \p RW        - Read or Write
+/// \p Locality  - Cache Level
+/// \p Name      - name of the result variable
+CallInst *IRBuilderBase::CreateMaskedGatherPrefetch(Type *Ty, Value *Ptrs,
+                                                    Align Alignment,
+                                                    Value *Mask, Value *RW,
+                                                    Value *Locality,
+                                                    const Twine &Name) {
+  auto *VecTy = cast<VectorType>(Ty);
+  ElementCount NumElts = VecTy->getElementCount();
+  auto *PtrsTy = cast<VectorType>(Ptrs->getType());
+  assert(cast<PointerType>(PtrsTy->getElementType())
+             ->isOpaqueOrPointeeTypeMatches(
+                 cast<VectorType>(Ty)->getElementType()) &&
+         "Element type mismatch");
+  assert(NumElts == PtrsTy->getElementCount() && "Element count mismatch");
+
+  if (!Mask)
+    Mask = Constant::getAllOnesValue(
+        VectorType::get(Type::getInt1Ty(Context), NumElts));
+
+  Type *OverloadedTypes[] = {PtrsTy};
+  Value *Ops[] = {Ptrs, getInt32(Alignment.value()), RW, Locality, Mask};
+
+  // We specify only one type when we create this intrinsic. Types of other
+  // arguments are derived from this type.
+  return CreateMaskedIntrinsic(Intrinsic::masked_gather_prefetch, Ops,
+                               OverloadedTypes, Name);
+}
+
 template <typename T0, typename T1, typename T2, typename T3>
 static std::vector<Value *>
 getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes,
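Note (reviewer illustration, not part of the patch): typical use of the two new builder entry points.
The values fed in (insertion point, pointers, mask) are assumed to exist; RW/Locality use the same
encoding as the corresponding llvm.prefetch operands (0/1 and 0-3):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Sketch only: emit one contiguous masked prefetch and one gather
    // prefetch. `B` is positioned in a function; `Ptr` is a scalar pointer,
    // `PtrVec` a <4 x ptr>, `Mask4` a <4 x i1>.
    static void emitPrefetches(IRBuilderBase &B, Value *Ptr, Value *PtrVec,
                               Value *Mask4) {
      auto *DataTy = FixedVectorType::get(B.getFloatTy(), 4);
      Value *RW = B.getInt32(0);       // 0 = prefetch for read
      Value *Locality = B.getInt32(3); // 3 = high temporal locality
      B.CreateMaskedPrefetch(DataTy, Ptr, Align(4), Mask4, RW, Locality);
      // Passing a null mask means "all lanes" for the gather form.
      B.CreateMaskedGatherPrefetch(DataTy, PtrVec, Align(4), /*Mask=*/nullptr,
                                   RW, Locality);
    }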
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -600,7 +600,7 @@
 bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                                ElementCount VF) const {
-  Value *Ptr = getLoadStorePointerOperand(&I);
+  Value *Ptr = getLdStPfPointerOperand(&I);
   if (!Ptr)
     return false;
   // Note: There's nothing inherent which prevents predicated loads and
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1470,19 +1470,28 @@
            TTI.isLegalMaskedLoad(DataType, Alignment);
   }
 
+  /// Returns true if the target machine supports masked prefetch operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+  bool isLegalMaskedPrefetch(Type *DataType, Value *Ptr,
+                             Align Alignment) const {
+    return Legal->isConsecutivePtr(DataType, Ptr) &&
+           TTI.isLegalMaskedPrefetch(DataType, Alignment);
+  }
+
   /// Returns true if the target machine can represent \p V as a masked gather
   /// or scatter operation.
   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
     bool LI = isa<LoadInst>(V);
     bool SI = isa<StoreInst>(V);
-    if (!LI && !SI)
+    bool PF = isa<PrefetchInst>(V);
+    if (!LI && !SI && !PF)
       return false;
-    auto *Ty = getLoadStoreType(V);
-    Align Align = getLoadStoreAlignment(V);
+    auto *Ty = getLdStPfType(V);
+    Align Align = getLdStPfAlignment(V);
     if (VF.isVector())
       Ty = VectorType::get(Ty, VF);
     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
-           (SI && TTI.isLegalMaskedScatter(Ty, Align));
+           (SI && TTI.isLegalMaskedScatter(Ty, Align)) ||
+           (PF && TTI.isLegalMaskedPrefetch(Ty, Align));
   }
 
   /// Returns true if the target machine supports all of the reduction
@@ -4401,8 +4410,18 @@
   switch(I->getOpcode()) {
   default:
     return true;
-  case Instruction::Call:
-    return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+  case Instruction::Call: {
+    if (!isa<PrefetchInst>(I))
+      return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+    auto *Ptr = getPrefetchPointerOperand(I);
+    auto *Ty = getPrefetchType(I);
+    Type *VTy = Ty;
+    if (VF.isVector())
+      VTy = VectorType::get(Ty, VF);
+    const Align Alignment = getPrefetchAlignment(I);
+    return !(isLegalMaskedPrefetch(Ty, Ptr, Alignment) ||
+             TTI.isLegalMaskedGatherPrefetch(VTy, Alignment));
+  }
   case Instruction::Load:
   case Instruction::Store: {
     auto *Ptr = getLoadStorePointerOperand(I);
@@ -4609,10 +4628,10 @@
 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
     Instruction *I, ElementCount VF) {
   // Get and ensure we have a valid memory instruction.
-  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+  assert((isa<LoadInst, StoreInst, PrefetchInst>(I)) &&
+         "Invalid memory instruction");
 
-  auto *Ptr = getLoadStorePointerOperand(I);
-  auto *ScalarTy = getLoadStoreType(I);
+  auto *Ptr = getLdStPfPointerOperand(I);
+  auto *ScalarTy = getLdStPfType(I);
 
   // In order to be widened, the pointer should be consecutive, first of all.
   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
@@ -6469,11 +6488,11 @@
   if (VF.isScalable())
     return InstructionCost::getInvalid();
 
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto SE = PSE.getSE();
 
-  unsigned AS = getLoadStoreAddressSpace(I);
-  Value *Ptr = getLoadStorePointerOperand(I);
+  unsigned AS = getLdStPfAddressSpace(I);
+  Value *Ptr = getLdStPfPointerOperand(I);
   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
   //       that it is being called from this specific place.
@@ -6489,7 +6508,7 @@
   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  const Align Alignment = getLoadStoreAlignment(I);
+  const Align Alignment = getLdStPfAlignment(I);
   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
                                                       ValTy->getScalarType(),
                                                       Alignment, AS, CostKind);
@@ -6524,16 +6543,16 @@
 InstructionCost
 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                     ElementCount VF) {
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
-  Value *Ptr = getLoadStorePointerOperand(I);
-  unsigned AS = getLoadStoreAddressSpace(I);
+  Value *Ptr = getLdStPfPointerOperand(I);
+  unsigned AS = getLdStPfAddressSpace(I);
   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
          "Stride should be 1 or -1 for consecutive memory access");
-  const Align Alignment = getLoadStoreAlignment(I);
+  const Align Alignment = getLdStPfAlignment(I);
   InstructionCost Cost = 0;
   if (Legal->isMaskRequired(I)) {
     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
@@ -6556,11 +6575,16 @@
                                                  ElementCount VF) {
   assert(Legal->isUniformMemOp(*I, VF));
 
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
-  const Align Alignment = getLoadStoreAlignment(I);
-  unsigned AS = getLoadStoreAddressSpace(I);
+  const Align Alignment = getLdStPfAlignment(I);
+  unsigned AS = getLdStPfAddressSpace(I);
   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  if (isa<PrefetchInst>(I)) {
+    return TTI.getAddressComputationCost(ValTy) +
+           TTI.getMemoryOpCost(Instruction::Call, ValTy, Alignment, AS,
+                               CostKind);
+  }
   if (isa<LoadInst>(I)) {
     return TTI.getAddressComputationCost(ValTy) +
            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
@@ -6582,10 +6606,10 @@
 InstructionCost
 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                  ElementCount VF) {
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
-  const Align Alignment = getLoadStoreAlignment(I);
-  const Value *Ptr = getLoadStorePointerOperand(I);
+  const Align Alignment = getLdStPfAlignment(I);
+  const Value *Ptr = getLdStPfPointerOperand(I);
 
   return TTI.getAddressComputationCost(VectorTy) +
          TTI.getGatherScatterOpCost(
@@ -6811,9 +6835,9 @@
     // Calculate scalar cost only. Vectorization cost should be ready at this
     // moment.
     if (VF.isScalar()) {
-      Type *ValTy = getLoadStoreType(I);
-      const Align Alignment = getLoadStoreAlignment(I);
-      unsigned AS = getLoadStoreAddressSpace(I);
+      Type *ValTy = getLdStPfType(I);
+      const Align Alignment = getLdStPfAlignment(I);
+      unsigned AS = getLdStPfAddressSpace(I);
 
       TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
       return TTI.getAddressComputationCost(ValTy) +
@@ -6914,7 +6938,7 @@
   for (BasicBlock *BB : TheLoop->blocks()) {
     // For each instruction in the old loop.
     for (Instruction &I : *BB) {
-      Value *Ptr = getLoadStorePointerOperand(&I);
+      Value *Ptr = getLdStPfPointerOperand(&I);
       if (!Ptr)
         continue;
@@ -6974,7 +6998,7 @@
       if (memoryInstructionCanBeWidened(&I, VF)) {
         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
         int ConsecutiveStride = Legal->isConsecutivePtr(
-            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
+            getLdStPfType(&I), getLdStPfPointerOperand(&I));
         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
                "Expected consecutive stride.");
         InstWidening Decision =
@@ -7408,6 +7432,20 @@
     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
+    if (isa<PrefetchInst>(I)) {
+      ElementCount Width = VF;
+      if (Width.isVector()) {
+        InstWidening Decision = getWideningDecision(I, Width);
+        assert(Decision != CM_Unknown &&
+               "CM decision should be taken at this point");
+        if (getWideningCost(I, VF) == InstructionCost::getInvalid())
+          return InstructionCost::getInvalid();
+        if (Decision == CM_Scalarize)
+          Width = ElementCount::getFixed(1);
+      }
+      VectorTy = ToVectorTy(getLdStPfType(I), Width);
+      return getMemoryInstructionCost(I, VF);
+    }
     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
         return *RedCost;
@@ -8240,7 +8278,7 @@
                                                 ArrayRef<VPValue *> Operands,
                                                 VFRange &Range,
                                                 VPlanPtr &Plan) {
-  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+  assert((isa<LoadInst>(I) || isa<StoreInst>(I) || isa<PrefetchInst>(I)) &&
         "Must be called with either a load, store or prefetch");
 
   auto willWiden = [&](ElementCount VF) -> bool {
@@ -8275,6 +8313,10 @@
     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
                                               Consecutive, Reverse);
 
+  if (PrefetchInst *Prefetch = dyn_cast<PrefetchInst>(I))
+    return new VPWidenMemoryInstructionRecipe(*Prefetch, Operands[0], Mask,
+                                              Consecutive, Reverse);
+
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
                                             Mask, Consecutive, Reverse);
@@ -8694,10 +8736,12 @@
           [&](ElementCount VF) { return VF.isScalar(); }, Range))
     return nullptr;
 
-  if (auto *CI = dyn_cast<CallInst>(Instr))
+  if (isa<CallInst>(Instr) && !isa<PrefetchInst>(Instr)) {
+    auto *CI = cast<CallInst>(Instr);
     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
+  }
 
-  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr) ||
+      isa<PrefetchInst>(Instr))
     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
 
   if (!shouldWiden(Instr, Range))
@@ -9626,7 +9670,7 @@
   if (IsUniform) {
     // If the recipe is uniform across all parts (instead of just per VF), only
    // generate a single instance.
-    if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
+    if ((isa<LoadInst>(UI) || isa<StoreInst>(UI) || isa<PrefetchInst>(UI)) &&
         all_of(operands(), [](VPValue *Op) {
           return Op->isDefinedOutsideVectorRegions();
         })) {
@@ -9656,6 +9700,16 @@
     return;
   }
 
+  // A prefetch to a uniform address only needs the last copy of the prefetch.
+  if (isa<PrefetchInst>(UI) &&
+      vputils::isUniformAfterVectorization(getOperand(0))) {
+    auto Lane = VPLane::getLastLaneForVF(State.VF);
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
+                                    State);
+    return;
+  }
+
   // Generate scalar instances for all VF lanes of all UF parts.
   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
   const unsigned EndLane = State.VF.getKnownMinValue();
@@ -9670,15 +9724,17 @@
   // Attempt to issue a wide load.
   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
+  PrefetchInst *PF = dyn_cast<PrefetchInst>(&Ingredient);
 
-  assert((LI || SI) && "Invalid Load/Store instruction");
+  assert((LI || SI || PF) && "Invalid Load/Store/Prefetch instruction");
   assert((!SI || StoredValue) && "No stored value provided for widened store");
   assert((!LI || !StoredValue) && "Stored value provided for widened load");
+  assert((!PF || !StoredValue) &&
+         "Stored value provided for widened prefetch");
 
-  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  Type *ScalarDataTy = getLdStPfType(&Ingredient);
 
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+  const Align Alignment = getLdStPfAlignment(&Ingredient);
   bool CreateGatherScatter = !isConsecutive();
 
   auto &Builder = State.Builder;
@@ -9762,6 +9818,37 @@
     return;
   }
 
+  if (PF) {
+    State.setDebugLocFromInst(PF);
+    Value *RW = PF->getArgOperand(1);
+    Value *Locality = PF->getArgOperand(2);
+
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Instruction *NewPF = nullptr;
+      if (CreateGatherScatter) {
+        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        Value *VectorGep = State.get(getAddr(), Part);
+        NewPF = Builder.CreateMaskedGatherPrefetch(DataTy, VectorGep,
+                                                   Alignment, MaskPart, RW,
+                                                   Locality);
+      } else {
+        auto *VecPtr =
+            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+        if (isMaskRequired) {
+          NewPF = Builder.CreateMaskedPrefetch(DataTy, VecPtr, Alignment,
+                                               BlockInMaskParts[Part], RW,
+                                               Locality);
+        } else {
+          auto *MaskPart = Constant::getAllOnesValue(
+              VectorType::get(Type::getInt1Ty(DataTy->getContext()), DataTy));
+          NewPF = Builder.CreateMaskedPrefetch(DataTy, VecPtr, Alignment,
+                                               MaskPart, RW, Locality);
+        }
+      }
+      State.addMetadata(NewPF, PF);
+    }
+    return;
+  }
+
   // Handle loads.
   assert(LI && "Must have a load instruction");
   State.setDebugLocFromInst(LI);
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1901,7 +1901,8 @@
   }
 
   bool isMasked() const {
-    return isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
+    return isPrefetch() ? getNumOperands() == 5
+                        : isStore() ? getNumOperands() == 3
+                                    : getNumOperands() == 2;
   }
 
 public:
@@ -1923,6 +1924,14 @@
     setMask(Mask);
   }
 
+  VPWidenMemoryInstructionRecipe(PrefetchInst &Prefetch, VPValue *Addr,
+                                 VPValue *Mask, bool Consecutive, bool Reverse)
+      : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}),
+        Ingredient(Prefetch), Consecutive(Consecutive), Reverse(Reverse) {
+    assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+    setMask(Mask);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC)
 
   /// Return the address accessed by this recipe.
@@ -1940,6 +1949,9 @@
   /// Returns true if this recipe is a store.
   bool isStore() const { return isa<StoreInst>(Ingredient); }
 
+  /// Returns true if this recipe is a prefetch.
+  bool isPrefetch() const { return isa<PrefetchInst>(Ingredient); }
+
   /// Return the address accessed by this recipe.
   VPValue *getStoredValue() const {
     assert(isStore() && "Stored value only available for store instructions");
Index: llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -135,13 +135,17 @@
            "underlying instruction has side-effects");
     return false;
   }
-  case VPWidenMemoryInstructionSC:
+  case VPWidenMemoryInstructionSC: {
+    auto *R = cast<VPWidenMemoryInstructionRecipe>(this);
+    if (isa<PrefetchInst>(R->getIngredient()))
+      return true;
     assert(cast<VPWidenMemoryInstructionRecipe>(this)
                ->getIngredient()
                .mayHaveSideEffects() == mayWriteToMemory() &&
           "mayHaveSideffects result for ingredient differs from this "
           "implementation");
     return mayWriteToMemory();
+  }
   case VPReplicateSC: {
     auto *R = cast<VPReplicateRecipe>(this);
     return R->getUnderlyingInstr()->mayHaveSideEffects();
@@ -1081,7 +1085,7 @@
                                            VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN ";
 
-  if (!isStore()) {
+  if (!isStore() && !isPrefetch()) {
     getVPSingleValue()->printAsOperand(O, SlotTracker);
     O << " = ";
   }
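Note (reviewer illustration, not part of the patch): since both TTI hooks default to false, existing
targets are unaffected; a target with hardware masked/gather prefetches would opt in from its own TTI
implementation. A sketch with a made-up target name and a placeholder policy:

    // Hypothetical <Target>TTIImpl members; the real conditions would check
    // subtarget features and the supported element types/alignments.
    bool MyTargetTTIImpl::isLegalMaskedPrefetch(Type *DataType,
                                                Align Alignment) const {
      return isa<FixedVectorType>(DataType); // placeholder policy
    }

    bool MyTargetTTIImpl::isLegalMaskedGatherPrefetch(Type *DataType,
                                                      Align Alignment) const {
      return isLegalMaskedPrefetch(DataType, Alignment);
    }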