Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -588,7 +588,8 @@ bool enableInterleavedAccessVectorization() const; /// Enable matching of interleaved access groups that contain predicated - /// accesses and are vectorized using masked vector loads/stores. + /// accesses or gaps and therefore vectorized using masked + /// vector loads/stores. bool enableMaskedInterleavedAccessVectorization() const; /// Indicate that it is potentially unsafe to automatically vectorize @@ -826,10 +827,11 @@ /// \p Alignment is the alignment of the memory operation /// \p AddressSpace is address space of the pointer. /// \p IsMasked indicates if the memory access is predicated. + /// \p UseMaskForGaps indicates if gaps should be masked. int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, - bool IsMasked = false) const; + unsigned AddressSpace, bool IsMasked = false, + bool UseMaskForGaps = false) const; /// Calculate the cost of performing a vector reduction. /// @@ -1140,7 +1142,8 @@ ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false) = 0; + bool IsMasked = false, + bool UseMaskForGaps = false) = 0; virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy, @@ -1482,9 +1485,11 @@ } int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool IsMasked) override { + unsigned AddressSpace, bool IsMasked, + bool UseMaskForGaps) override { return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace, IsMasked, + UseMaskForGaps); } int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -453,7 +453,8 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false) { + bool IsMasked = false, + bool UseMaskForGaps = false) { return 1; } Index: include/llvm/Analysis/VectorUtils.h =================================================================== --- include/llvm/Analysis/VectorUtils.h +++ include/llvm/Analysis/VectorUtils.h @@ -24,6 +24,7 @@ template class ArrayRef; class DemandedBits; class GetElementPtrInst; +class InterleaveGroup; class Loop; class ScalarEvolution; class TargetTransformInfo; @@ -125,6 +126,19 @@ /// This function always sets a (possibly null) value for each K in Kinds. Instruction *propagateMetadata(Instruction *I, ArrayRef VL); +/// Create a mask that masks away gaps of an interleave group. +/// +/// For example, the mask for \p Group with interleave-factor 3 +/// and \p VF 4, that has only its first member present is: +/// +/// <1,0,0,1,0,0,1,0,0,1,0,0> +/// +/// Note: The result is a mask of 0's and 1's, as opposed to the other +/// create[*]Mask() utilities which create a shuffle mask (mask that +/// consists of indices). 
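/// Returns a null pointer if the group has no gaps (i.e. all of its Factor
/// members are present), in which case no such mask is needed.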
+Constant *createTrueFalseMaskForGaps(IRBuilder<> &Builder, unsigned VF, + const InterleaveGroup &Group); + /// Create a mask with replicated elements. /// /// This function creates a shuffle mask for replicating each of the \p VF @@ -405,10 +419,10 @@ /// out-of-bounds requires a scalar epilogue iteration for correctness. bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; } - /// Invalidate groups that require a scalar epilogue (due to gaps). This can - /// happen when we optimize for size and don't allow creating a scalar - /// epilogue. - void invalidateGroupsRequiringScalarEpilogue(); + /// Invalidate groups that require a scalar epilogue (due to gaps), if there + /// is no other means (such as masking) to support them. This can happen when + /// we optimize for size and don't allow creating a scalar epilogue. + void invalidateGroupsRequiringScalarEpilogue(bool EnabledMaskedInterleave); private: /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -785,7 +785,8 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false) { + bool IsMasked = false, + bool UseMaskForGaps = false) { VectorType *VT = dyn_cast(VecTy); assert(VT && "Expect a vector type for interleaved memory op"); @@ -797,7 +798,7 @@ // Firstly, the cost of load/store operation. unsigned Cost; - if (IsMasked) + if (IsMasked || UseMaskForGaps) Cost = static_cast(this)->getMaskedMemoryOpCost( Opcode, VecTy, Alignment, AddressSpace); else @@ -923,6 +924,15 @@ Cost += static_cast(this)->getVectorInstrCost( Instruction::InsertElement, MaskVT, i); + // The Gaps mask is invariant and created outside the loop, therefore the + // cost of creating it is not accounted for here. However if we have both + // a MaskForGaps and some other mask that guards the execution of the + // memory access, we need to account for the cost of And-ing the two masks + // inside the loop. + if (UseMaskForGaps) + Cost += static_cast(this)->getArithmeticInstrCost( + BinaryOperator::And, MaskVT); + return Cost; } Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -519,9 +519,11 @@ int TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, bool IsMasked) const { - int Cost = TTIImpl->getInterleavedMemoryOpCost( - Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked); + unsigned Alignment, unsigned AddressSpace, bool IsMasked, + bool UseMaskForGaps) const { + int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, + IsMasked, UseMaskForGaps); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: lib/Analysis/VectorUtils.cpp =================================================================== --- lib/Analysis/VectorUtils.cpp +++ lib/Analysis/VectorUtils.cpp @@ -504,6 +504,25 @@ return Inst; } +Constant *llvm::createTrueFalseMaskForGaps(IRBuilder<> &Builder, unsigned VF, + const InterleaveGroup &Group) { + // All 1's means mask is not needed. + if (Group.getNumMembers() == Group.getFactor()) + return nullptr; + + // TODO: support reversed access. 
+ assert(!Group.isReverse() && "Reversed group not supported."); + + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < Group.getFactor(); ++j) { + unsigned HasMember = Group.getMember(j) ? 1 : 0; + Mask.push_back(Builder.getInt1(HasMember)); + } + + return ConstantVector::get(Mask); +} + Constant *llvm::createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor, unsigned VF) { SmallVector MaskVec; @@ -920,7 +939,8 @@ } } -void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() { +void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue( + bool EnabledMaskedInterleave) { // If no group had triggered the requirement to create an epilogue loop, // there is nothing to do. if (!requiresScalarEpilogue()) @@ -930,14 +950,15 @@ SmallPtrSet DelSet; for (auto &I : InterleaveGroupMap) { InterleaveGroup *Group = I.second; - if (Group->requiresScalarEpilogue()) + if (Group->requiresScalarEpilogue() && !EnabledMaskedInterleave) DelSet.insert(Group); } for (auto *Ptr : DelSet) { LLVM_DEBUG( - dbgs() + dbgs() << "LV: Invalidate candidate interleaved group due to gaps that " - "require a scalar epilogue.\n"); + "require a scalar epilogue (not allowed under optsize) or masking " + "of interleave-groups (not enabled). \n"); releaseGroup(Ptr); } Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -146,7 +146,8 @@ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool IsMasked = false); + unsigned AddressSpace, bool IsMasked = false, + bool UseMaskForGaps = false); bool shouldConsiderAddressTypePromotion(const Instruction &I, Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -660,11 +660,13 @@ ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked) { + bool IsMasked, + bool UseMaskForGaps) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa(VecTy) && "Expect a vector type"); - if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) { + if (!IsMasked && !UseMaskForGaps && + Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -677,7 +679,8 @@ } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace, IsMasked, + UseMaskForGaps); } int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { Index: lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.h +++ lib/Target/ARM/ARMTargetTransformInfo.h @@ -169,7 +169,8 @@ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool IsMasked); + unsigned AddressSpace, bool IsMasked = false, + bool UseMaskForGaps = false); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); Index: lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== 
--- lib/Target/ARM/ARMTargetTransformInfo.cpp +++ lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -545,7 +545,8 @@ ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked) { + bool IsMasked, + bool UseMaskForGaps) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa(VecTy) && "Expect a vector type"); @@ -553,7 +554,7 @@ bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && - !IsMasked) { + !IsMasked && !UseMaskForGaps) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -566,7 +567,8 @@ } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace, IsMasked, + UseMaskForGaps); } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, Index: lib/Target/Hexagon/HexagonTargetTransformInfo.h =================================================================== --- lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -123,7 +123,8 @@ bool VariableMask, unsigned Alignment); unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool IsMasked); + unsigned AddressSpace, bool IsMasked = false, + bool UseMaskForGaps = false); unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I); unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, Index: lib/Target/Hexagon/HexagonTargetTransformInfo.cpp =================================================================== --- lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -206,10 +206,12 @@ unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, bool IsMasked) { - if (Indices.size() != Factor || IsMasked) + unsigned Alignment, unsigned AddressSpace, bool IsMasked, + bool UseMaskForGaps) { + if (Indices.size() != Factor || IsMasked || UseMaskForGaps) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace, IsMasked, + UseMaskForGaps); return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr); } Index: lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.h +++ lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -91,7 +91,8 @@ ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false); + bool IsMasked = false, + bool UseMaskForGaps = false); /// @} }; Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -474,10 +474,12 @@ ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked) { - if (IsMasked) + bool IsMasked, + bool UseMaskForGaps) { + if (IsMasked || UseMaskForGaps) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace, IsMasked, + UseMaskForGaps); assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); Index: 
lib/Target/SystemZ/SystemZTargetTransformInfo.h =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -92,7 +92,8 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool IsMasked = false); + unsigned AddressSpace, bool IsMasked = false, + bool UseMaskForGaps = false); /// @} }; Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -910,10 +910,12 @@ ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked) { - if (IsMasked) + bool IsMasked, + bool UseMaskForGaps) { + if (IsMasked || UseMaskForGaps) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace, IsMasked, + UseMaskForGaps); assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -102,15 +102,18 @@ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false); + bool IsMasked = false, + bool UseMaskForGaps = false); int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false); + bool IsMasked = false, + bool UseMaskForGaps = false); int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false); + bool IsMasked = false, + bool UseMaskForGaps = false); int getIntImmCost(int64_t); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -2728,11 +2728,13 @@ ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked) { + bool IsMasked, + bool UseMaskForGaps) { - if (IsMasked) + if (IsMasked || UseMaskForGaps) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace, IsMasked, + UseMaskForGaps); // We currently Support only fully-interleaved groups, with no gaps. // TODO: Support also strided loads (interleaved-groups with gaps). @@ -2842,11 +2844,13 @@ ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked) { + bool IsMasked, + bool UseMaskForGaps) { - if (IsMasked) + if (IsMasked || UseMaskForGaps) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace, IsMasked, + UseMaskForGaps); // VecTy for interleave memop is . 
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have @@ -2965,7 +2969,8 @@ ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, - bool IsMasked) { + bool IsMasked, + bool UseMaskForGaps) { auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || @@ -2977,11 +2982,14 @@ }; if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace, IsMasked, + UseMaskForGaps); if (ST->hasAVX2()) return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace, IsMasked, + UseMaskForGaps); return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace, IsMasked, + UseMaskForGaps); } Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -172,6 +172,8 @@ "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop")); +/// An interleave-group may need masking if it resides in a block that needs +/// predication, or in order to mask away gaps. static cl::opt EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); @@ -1134,11 +1136,15 @@ } /// Returns true if an interleaved group requires a scalar iteration - /// to handle accesses with gaps. + /// to handle accesses with gaps, and there is nothing preventing us from + /// creating a scalar epilogue. bool requiresScalarEpilogue() const { - return InterleaveInfo.requiresScalarEpilogue(); + return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue(); } + /// Returns true if a scalar epilogue is not allowed due to optsize. + bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; } + /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } @@ -1229,6 +1235,15 @@ /// vectorization as a predicated block. SmallPtrSet PredicatedBBsAfterVectorization; + /// Records whether it is allowed to have the original scalar loop execute at + /// least once. This may be needed as a fallback loop in case runtime + /// aliasing/dependence checks fail, or to handle the tail/remainder + /// iterations when the trip count is unknown or doesn't divide by the VF, + /// or as a peel-loop to handle gaps in interleave-groups. + /// Under optsize we don't allow any iterations to execute in the scalar + /// loop. + bool IsScalarEpilogueAllowed = true; + /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; @@ -1938,6 +1953,17 @@ "reverse"); } +// Return whether we allow using masked interleave-groups (for dealing with +// strided loads/stores that reside in predicated blocks, or for dealing +// with gaps). 
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { + if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)) + return TTI.enableMaskedInterleavedAccessVectorization(); + + // If an override option has been passed in for interleaved accesses, use it. + return EnableMaskedInterleavedMemAccesses; +} + // Try to vectorize the interleave group that \p Instr belongs to. // // E.g. Translate following interleaved load group (factor = 3): @@ -1995,7 +2021,7 @@ Mask = *BlockInMask; // TODO: extend the masked interleaved-group support to reversed access. assert(!Group->isReverse() && "Reversed masked interleave-group " - "not supported."); + "not supported."); } // If the group is reverse, adjust the index to refer to the last vector lane @@ -2036,20 +2062,35 @@ setDebugLocFromInst(Builder, Instr); Value *UndefVec = UndefValue::get(VecTy); + Value *MaskForGaps = nullptr; + if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { + MaskForGaps = createTrueFalseMaskForGaps(Builder, VF, *Group); + assert(MaskForGaps && "Mask for Gaps is required but it is null"); + } + // Vectorize the interleaved load group. if (isa(Instr)) { // For each unroll part, create a wide load for the group. SmallVector NewLoads; for (unsigned Part = 0; Part < UF; Part++) { Instruction *NewLoad; - if (IsMaskRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); - auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); - Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); - NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), - ShuffledMask, UndefVec, - "wide.masked.vec"); + if (IsMaskRequired || MaskForGaps) { + assert(useMaskedInterleavedAccesses(*TTI) && + "masked interleaved groups are not allowed."); + Value *GroupMask = MaskForGaps; + if (IsMaskRequired) { + auto *Undefs = UndefValue::get(Mask[Part]->getType()); + auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); + Value *ShuffledMask = Builder.CreateShuffleVector( + Mask[Part], Undefs, RepMask, "interleaved.mask"); + GroupMask = MaskForGaps + ? Builder.CreateBinOp(Instruction::And, ShuffledMask, + MaskForGaps) + : ShuffledMask; + } + NewLoad = + Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), + GroupMask, UndefVec, "wide.masked.vec"); } else NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], @@ -4329,29 +4370,32 @@ return false; } -static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { - if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)) - return TTI.enableMaskedInterleavedAccessVectorization(); - - // If an override option has been passed in for interleaved accesses, use it. - return EnableMaskedInterleavedMemAccesses; -} - bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, unsigned VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); assert(getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."); + auto *Group = getInterleavedAccessGroup(I); + assert(Group && "Must have a group."); - if (!Legal->blockNeedsPredication(I->getParent()) || - !Legal->isMaskRequired(I)) + // Check if masking is required. + // A Group may need masking for one of two reasons: it resides in a block that + // needs predication, or it was decided to use masking to deal with gaps. 
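  // The latter happens under optsize: the group has gaps but no scalar
  // epilogue is allowed to execute the left-over accesses, so the gaps must
  // be masked away instead.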
+ bool PredicatedAccessRequiresMasking = + Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); + bool AccessWithGapsRequiresMasking = + Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) return true; - if (!useMaskedInterleavedAccesses(TTI)) - return false; + // If masked interleaving is required, we expect that the user/target had + // enabled it, because otherwise it either wouldn't have been created or + // it should have been invalidated by the costModel. + assert(useMaskedInterleavedAccesses(TTI) && + "Masked interleave-groups for predicated accesses are not enabled."); auto *Ty = getMemInstValueType(I); - return isa(I) ? TTI.isLegalMaskedLoad(Ty) + return isa(I) ? TTI.isLegalMaskedLoad(Ty) : TTI.isLegalMaskedStore(Ty); } @@ -4602,9 +4646,12 @@ // Record that scalar epilogue is not allowed. LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + IsScalarEpilogueAllowed = !OptForSize; + // We don't create an epilogue when optimizing for size. // Invalidate interleave groups that require an epilogue. - InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); + InterleaveInfo.invalidateGroupsRequiringScalarEpilogue( + useMaskedInterleavedAccesses(TTI)); unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); @@ -5488,13 +5535,15 @@ } // Calculate the cost of the whole interleaved group. + bool UseMaskForGaps = + Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; unsigned Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, - Group->getAlignment(), AS, Legal->isMaskRequired(I)); + Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. - assert(!Legal->isMaskRequired(I) && + assert(!Legal->isMaskRequired(I) && "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); Index: test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll =================================================================== --- test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -84,26 +84,44 @@ ; Exactly the same scenario except we are now optimizing for size, therefore ; we check that no scalar epilogue is created. Since we can't create an epilog -; the interleave-group is invalidated because is has gaps, so we end up -; scalarizing. +; we need the ability to mask out the gaps. +; When enable-masked-interleaved-access is disabled the interleave-group is +; invalidated, so we end up scalarizing. ; (Before the fix that this test checks, we used to create an epilogue despite ; optsize, and vectorized the access as an interleaved-group. This is now fixed, ; and we make sure that a scalar epilogue does not exist). +; When enable-masked-interleaved-access is enabled, the interleave-groups will +; be vectorized with masked wide-loads with the mask properly shuffled and +; And-ed with the gaps mask. 
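; In this function the group has factor 2 with only member 0 present (the
; index into p is 2*ix), so with VF=8 the gaps mask is the <16 x i1> constant
; <1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0>, and the mask fed to the wide load is the
; replicated guard mask And-ed with that constant.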
;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize( -;ENABLED_MASKED_STRIDED: vector.body: -;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 -;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ -;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask = -;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8 -;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} -;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], -;ENABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 -;ENABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue -;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask = -;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8 -;ENABLED_MASKED_STRIDED-NOT: for.body: -;ENABLED_MASKED_STRIDED: for.end: +;ENABLED_MASKED_STRIDED-NEXT: entry: +;ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +;ENABLED_MASKED_STRIDED: vector.body: +;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] +;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +;ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> +;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], +;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef) +;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> +;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +;ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>* +;ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]]) +;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +;ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]] +;ENABLED_MASKED_STRIDED-NOT: for.body: +;ENABLED_MASKED_STRIDED: for.end: +;ENABLED_MASKED_STRIDED-NEXT: ret void + define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize { entry: @@ -138,12 +156,15 @@ ; remainder loop into the main loop using masking) together with interleaved- ; groups. 
; When masked-interleave-group is disabled the interleave-groups will be -; invalidated during Legality checks; -; When masked-interleave-group is enabled the interleave-groups will be -; invalidated during cost-model checks, because we don't have a way to support -; interleave-groups with gaps that require an epilogue using masking. -; So in both cases we check for no epilogue and scalarized conditional accesses. - +; invalidated during Legality checks; So there we check for no epilogue +; and for scalarized conditional accesses. +; When masked-interleave-group is enabled we check that there is no epilogue, +; and that the interleave-groups are vectorized using proper masking (with +; shuffling of the mask feeding the wide masked load/store). +; The mask itself is an And of two masks: one that masks away the remainder +; iterations, and one that masks away the 'else' of the 'if' statement. +; The shuffled mask is also And-ed with the gaps mask. +; ; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p, ; unsigned char* restrict q, ; unsigned char guard, @@ -178,21 +199,39 @@ ; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; ENABLED_MASKED_STRIDED: vector.ph: +; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}} -; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; ENABLED_MASKED_STRIDED: pred.load.if: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, 
[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]]) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]] ; ENABLED_MASKED_STRIDED-NOT: for.body: ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void @@ -232,16 +271,16 @@ } -; Same, but the load/store are not predicated. The interleave-group is -; invalidated here as well because we have gaps and we can't create an epilog. -; The access is thus scalarized. +; Same, but the load/store are not predicated. +; When enable-masked-interleaved-access is disabled, the interleave-groups will +; be invalidated during cost-model checks because we have gaps and we can't +; create an epilog. The access is thus scalarized. ; (Before the fix that this test checks, we used to create an epilogue despite ; optsize, and vectorized the access as an interleaved-group. This is now fixed, ; and we make sure that a scalar epilogue does not exist). -; Since enable-masked-interleaved-accesses currently only affects predicated -; accesses, the behavior is the same with this switch set/unset. - - +; When enable-masked-interleaved-access is enabled, the interleave-groups will +; be vectorized with masked wide-loads (masking away the gaps). 
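; Since the access is unconditional and the trip count (1024) is a multiple of
; VF=8, no runtime mask is needed here: the wide load is masked directly with
; the constant gaps mask <1,0,1,0,...>, and the store of the extracted elements
; remains a regular (unmasked) store.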
+; ; void unconditional_strided1_optsize(const unsigned char* restrict p, ; unsigned char* restrict q, ; unsigned char guard) { @@ -259,11 +298,25 @@ ;DISABLED_MASKED_STRIDED: for.end: ;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize( -;ENABLED_MASKED_STRIDED: vector.body: -;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8 -;ENABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0 -;ENABLED_MASKED_STRIDED-NOT: for.body: -;ENABLED_MASKED_STRIDED: for.end: +;ENABLED_MASKED_STRIDED-NEXT: entry: +;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +;ENABLED_MASKED_STRIDED: vector.body: +;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]] +;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>* +;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> , <16 x i8> undef) +;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> +;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>* +;ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1 +;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]] +;ENABLED_MASKED_STRIDED-NOT: for.body: +;ENABLED_MASKED_STRIDED: for.end: +;ENABLED_MASKED_STRIDED-NEXT: ret void + define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize { entry: @@ -289,13 +342,17 @@ ; Unconditioal accesses with gaps under Optsize scenario again, with unknown ; trip-count this time, in order to check the behavior of folding-the-tail ; (folding the remainder loop into the main loop using masking) together with -; interleaved-groups. -; The interleave-groups will be invalidated during cost-model checks, because -; we don't have a way to support interleave-groups with gaps that require an -; epilogue using masking (even when interleaved-masking is enabled; this -; is not yet supported). -; So we check for no epilogue and for scalarized conditional accesses. - +; interleaved-groups. Folding-the-tail turns the accesses to conditional which +; requires proper masking. In addition we need to mask out the gaps (all +; because we are not allowed to use an epilog due to optsize). +; When enable-masked-interleaved-access is disabled, the interleave-groups will +; be invalidated during cost-model checks. So there we check for no epilogue +; and for scalarized conditional accesses. +; When masked-interleave-group is enabled we check that there is no epilogue, +; and that the interleave-groups are vectorized using proper masking (with +; shuffling of the mask feeding the wide masked load/store). +; The shuffled mask is also And-ed with the gaps mask. 
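; Here the only runtime mask is the fold-tail mask (the icmp ule of the
; induction vector against the broadcast trip-count-minus-1); it is replicated
; per group member and then And-ed with the constant gaps mask before the wide
; masked load.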
+; ; for(ix=0; ix < n; ++ix) { ; char t = p[2*ix]; ; q[ix] = t; @@ -319,21 +376,36 @@ ; DISABLED_MASKED_STRIDED: for.end: ; DISABLED_MASKED_STRIDED-NEXT: ret void - ; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; ENABLED_MASKED_STRIDED: vector.ph: +; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], -; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; ENABLED_MASKED_STRIDED: pred.load.if: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 
x i8>* +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP2]]) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END]], label [[VECTOR_BODY]] ; ENABLED_MASKED_STRIDED-NOT: for.body: ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void
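The mask manipulation exercised above can be summarized with a minimal
standalone sketch (plain C++, not taken from this patch; all function and
variable names are illustrative only): build the constant gaps mask of an
interleave group, replicate a per-lane block mask across the group members,
and And the two to obtain the mask for the wide masked load/store.

#include <cassert>
#include <cstdio>
#include <vector>

// Gaps mask: VF consecutive copies of the member-presence pattern, so lanes
// that belong to a missing member (a gap) are masked off.
static std::vector<bool> gapsMask(unsigned VF, const std::vector<bool> &HasMember) {
  std::vector<bool> Mask;
  for (unsigned I = 0; I < VF; ++I)
    for (bool Present : HasMember)
      Mask.push_back(Present);
  return Mask;
}

// Replicate a VF-wide block mask to VF*Factor lanes, Factor copies per lane
// (what the replicated shuffle of the block mask achieves in the patch).
static std::vector<bool> replicateMask(const std::vector<bool> &BlockMask,
                                       unsigned Factor) {
  std::vector<bool> Rep;
  for (bool Lane : BlockMask)
    for (unsigned J = 0; J < Factor; ++J)
      Rep.push_back(Lane);
  return Rep;
}

int main() {
  // Factor-3 group with a gap at member 1, vectorized with VF = 4.
  const unsigned VF = 4, Factor = 3;
  std::vector<bool> HasMember = {true, false, true};

  std::vector<bool> Gaps = gapsMask(VF, HasMember);

  // Block mask for the 4 lanes (e.g. from predication or tail folding).
  std::vector<bool> BlockMask = {true, true, true, false};
  std::vector<bool> Rep = replicateMask(BlockMask, Factor);
  assert(Gaps.size() == Rep.size());

  // Final mask for the wide masked load/store: element-wise And of the two.
  for (unsigned I = 0; I < Gaps.size(); ++I)
    std::printf("%d%c", (Gaps[I] && Rep[I]) ? 1 : 0,
                I + 1 == Gaps.size() ? '\n' : ',');
  return 0;
}

With the sample values this prints 1,0,1,1,0,1,1,0,1,0,0,0: the gap lanes are
always off, and the lanes of the last (masked-out) vector iteration are off as
well, mirroring the And of the shuffled block mask with the gaps mask.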