Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -587,6 +587,10 @@ /// Enable matching of interleaved access groups. bool enableInterleavedAccessVectorization() const; + /// Enable matching of interleaved access groups that contain predicated + /// accesses and are vectorized using masked vector loads/stores. + bool enableMaskedInterleavedAccessVectorization() const; + /// Indicate that it is potentially unsafe to automatically vectorize /// floating-point operations because the semantics of vector and scalar /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math @@ -821,9 +825,11 @@ /// load allows gaps) /// \p Alignment is the alignment of the memory operation /// \p AddressSpace is address space of the pointer. + /// \p IsMasked indicates if the memory access is predicated. int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) const; + unsigned AddressSpace, + bool IsMasked = false) const; /// Calculate the cost of performing a vector reduction. /// @@ -1072,6 +1078,7 @@ virtual const MemCmpExpansionOptions *enableMemCmpExpansion( bool IsZeroCmp) const = 0; virtual bool enableInterleavedAccessVectorization() = 0; + virtual bool enableMaskedInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, @@ -1132,7 +1139,8 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) = 0; + unsigned AddressSpace, + bool IsMasked = false) = 0; virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy, @@ -1346,6 +1354,9 @@ bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); } + bool enableMaskedInterleavedAccessVectorization() override { + return Impl.enableMaskedInterleavedAccessVectorization(); + } bool isFPVectorizationPotentiallyUnsafe() override { return Impl.isFPVectorizationPotentiallyUnsafe(); } @@ -1471,9 +1482,9 @@ } int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) override { + unsigned AddressSpace, bool IsMasked) override { return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -313,6 +313,8 @@ bool enableInterleavedAccessVectorization() { return false; } + bool enableMaskedInterleavedAccessVectorization() { return false; } + bool isFPVectorizationPotentiallyUnsafe() { return false; } bool allowsMisalignedMemoryAccesses(LLVMContext &Context, @@ -450,8 +452,8 @@ unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace) { + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false) { return 1; } Index: include/llvm/Analysis/VectorUtils.h 
=================================================================== --- include/llvm/Analysis/VectorUtils.h +++ include/llvm/Analysis/VectorUtils.h @@ -125,6 +125,21 @@ /// This function always sets a (possibly null) value for each K in Kinds. Instruction *propagateMetadata(Instruction *I, ArrayRef VL); +/// Create a mask with replicated elements. +/// +/// This function creates a shuffle mask for replicating each of the \p VF +/// elements in a vector \p ReplicationFactor times. It can be used to +/// transform a mask of \p VF elements into a mask of +/// \p VF * \p ReplicationFactor elements used by a predicated +/// interleaved-group of loads/stores whose Interleaved-factor == +/// \p ReplicationFactor. +/// +/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is: +/// +/// <0,0,0,1,1,1,2,2,2,3,3,3> +Constant *createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor, + unsigned VF); + /// Create an interleave shuffle mask. /// /// This function creates a shuffle mask for interleaving \p NumVecs vectors of @@ -328,7 +343,7 @@ InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, DominatorTree *DT, LoopInfo *LI, const LoopAccessInfo *LAI) - : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} + : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} ~InterleavedAccessInfo() { SmallPtrSet DelSet; @@ -341,7 +356,9 @@ /// Analyze the interleaved accesses and collect them in interleave /// groups. Substitute symbolic strides using \p Strides. - void analyzeInterleaving(); + /// Consider also predicated loads/stores in the analysis if + /// \p EnableMaskedInterleavedGroup is true. + void analyzeInterleaving(bool EnableMaskedInterleavedGroup); /// Check if \p Instr belongs to any interleave group. bool isInterleaved(Instruction *Instr) const { Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -783,8 +783,8 @@ unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace) { + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false) { VectorType *VT = dyn_cast(VecTy); assert(VT && "Expect a vector type for interleaved memory op"); @@ -795,8 +795,13 @@ VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts); // Firstly, the cost of load/store operation. - unsigned Cost = static_cast(this)->getMemoryOpCost( - Opcode, VecTy, Alignment, AddressSpace); + unsigned Cost; + if (IsMasked) + Cost = static_cast(this)->getMaskedMemoryOpCost( + Opcode, VecTy, Alignment, AddressSpace); + else + Cost = static_cast(this)->getMemoryOpCost(Opcode, VecTy, Alignment, + AddressSpace); // Legalize the vector type, and get the legalized and unlegalized type // sizes. @@ -892,6 +897,31 @@ ->getVectorInstrCost(Instruction::InsertElement, VT, i); } + if (!IsMasked) + return Cost; + + Type *I8Type = Type::getInt8Ty(VT->getContext()); + VectorType *MaskVT = VectorType::get(I8Type, NumElts); + SubVT = VectorType::get(I8Type, NumSubElts); + + // The Mask shuffling cost is extract all the elements of the Mask + // and insert each of them Factor times into the wide vector: + // + // E.g. 
an interleaved group with factor 3: + // %mask = icmp ult <8 x i32> %vec1, %vec2 + // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, + // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> + // The cost is estimated as extract all mask elements from the <8xi1> mask + // vector and insert them factor times into the <24xi1> shuffled mask + // vector. + for (unsigned i = 0; i < NumSubElts; i++) + Cost += static_cast(this)->getVectorInstrCost( + Instruction::ExtractElement, SubVT, i); + + for (unsigned i = 0; i < NumElts; i++) + Cost += static_cast(this)->getVectorInstrCost( + Instruction::InsertElement, MaskVT, i); + return Cost; } Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -268,6 +268,10 @@ return TTIImpl->enableInterleavedAccessVectorization(); } +bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const { + return TTIImpl->enableMaskedInterleavedAccessVectorization(); +} + bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const { return TTIImpl->isFPVectorizationPotentiallyUnsafe(); } @@ -515,9 +519,9 @@ int TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace) const { - int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + unsigned Alignment, unsigned AddressSpace, bool IsMasked) const { + int Cost = TTIImpl->getInterleavedMemoryOpCost( + Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: lib/Analysis/VectorUtils.cpp =================================================================== --- lib/Analysis/VectorUtils.cpp +++ lib/Analysis/VectorUtils.cpp @@ -502,6 +502,16 @@ return Inst; } +Constant *llvm::createReplicatedMask(IRBuilder<> &Builder, + unsigned ReplicationFactor, unsigned VF) { + SmallVector MaskVec; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < ReplicationFactor; j++) + MaskVec.push_back(Builder.getInt32(i)); + + return ConstantVector::get(MaskVec); +} + Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF, unsigned NumVecs) { SmallVector Mask; @@ -672,7 +682,8 @@ // this group because it and (2) are dependent. However, (1) can be grouped // with other accesses that may precede it in program order. Note that a // bottom-up order does not imply that WAW dependences should not be checked. -void InterleavedAccessInfo::analyzeInterleaving() { +void InterleavedAccessInfo::analyzeInterleaving( + bool EnablePredicatedInterleavedMemAccesses) { LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); const ValueToValueMap &Strides = LAI->getSymbolicStrides(); @@ -712,9 +723,8 @@ // create a group for B, we continue with the bottom-up algorithm to ensure // we don't break any of B's dependences. InterleaveGroup *Group = nullptr; - // TODO: Ignore B if it is in a predicated block. This restriction can be - // relaxed in the future once we handle masked interleaved groups. 
- if (isStrided(DesB.Stride) && !isPredicated(B->getParent())) { + if (isStrided(DesB.Stride) && + (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) { Group = getInterleaveGroup(B); if (!Group) { LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B @@ -808,11 +818,12 @@ if (DistanceToB % static_cast(DesB.Size)) continue; - // Ignore A if either A or B is in a predicated block. Although we - // currently prevent group formation for predicated accesses, we may be - // able to relax this limitation in the future once we handle more - // complicated blocks. - if (isPredicated(A->getParent()) || isPredicated(B->getParent())) + // All members of a predicated interleave-group must have the same predicate, + // and currently must reside in the same BB. + BasicBlock *BlockA = A->getParent(); + BasicBlock *BlockB = B->getParent(); + if ((isPredicated(BlockA) || isPredicated(BlockB)) && + (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB)) continue; // The index of A is the index of B plus A's distance to B in multiples Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -146,7 +146,7 @@ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked = false); bool shouldConsiderAddressTypePromotion(const Instruction &I, Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -659,11 +659,12 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa(VecTy) && "Expect a vector type"); - if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { + if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -676,7 +677,7 @@ } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { Index: lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.h +++ lib/Target/ARM/ARMTargetTransformInfo.h @@ -169,7 +169,7 @@ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); Index: lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.cpp +++ lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -542,14 +542,16 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element.
bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; - if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { + if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && + !IsMasked) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -562,7 +564,7 @@ } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, Index: lib/Target/Hexagon/HexagonTargetTransformInfo.h =================================================================== --- lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -123,7 +123,7 @@ bool VariableMask, unsigned Alignment); unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked); unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I); unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, Index: lib/Target/Hexagon/HexagonTargetTransformInfo.cpp =================================================================== --- lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -206,10 +206,10 @@ unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace) { - if (Indices.size() != Factor) + unsigned Alignment, unsigned AddressSpace, bool IsMasked) { + if (Indices.size() != Factor || IsMasked) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr); } Index: lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.h +++ lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -90,7 +90,8 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, + bool IsMasked = false); /// @} }; Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -473,7 +473,12 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); + assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -92,7 +92,7 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked = false); /// @} }; Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++
lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -883,7 +883,11 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -101,13 +101,16 @@ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getIntImmCost(int64_t); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -2719,7 +2719,12 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); // We currently Support only fully-interleaved groups, with no gaps. // TODO: Support also strided loads (interleaved-groups with gaps). @@ -2828,7 +2833,12 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); // VecTy for interleave memop is . 
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have @@ -2946,7 +2956,8 @@ unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || @@ -2958,11 +2969,11 @@ }; if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); if (ST->hasAVX2()) return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -172,6 +172,10 @@ "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop")); +static cl::opt EnableMaskedInterleavedMemAccesses( + "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); + /// We don't interleave loops with a known constant trip count below this /// number. static const unsigned TinyTripCountInterleaveThreshold = 128; @@ -408,8 +412,10 @@ /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to. - void vectorizeInterleaveGroup(Instruction *Instr); + /// Try to vectorize the interleaved access group that \p Instr belongs to, + /// optionally masking the vector operations if \p BlockInMask is non-null. + void vectorizeInterleaveGroup(Instruction *Instr, + VectorParts *BlockInMask = nullptr); /// Vectorize Load and Store instructions, optionally masking the vector /// operations if \p BlockInMask is non-null. @@ -1112,6 +1118,11 @@ /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + /// Returns true if \p I is a memory instruction in an interleaved-group + /// of memory accesses that can be vectorized with wide vector loads/stores + /// and shuffles. + bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); + /// Check if \p Instr belongs to any interleaved access group. 
bool isAccessInterleaved(Instruction *Instr) { return InterleaveInfo.isInterleaved(Instr); @@ -1946,7 +1957,8 @@ // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B -void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, + VectorParts *BlockInMask) { const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); @@ -1968,6 +1980,15 @@ SmallVector NewPtrs; unsigned Index = Group->getIndex(Instr); + VectorParts Mask; + bool IsMaskRequired = BlockInMask; + if (IsMaskRequired) { + Mask = *BlockInMask; + // TODO: extend the masked interleaved-group support to reversed access. + assert(!Group->isReverse() && "Reversed masked interleave-group " + "not supported."); + } + // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the @@ -2011,8 +2032,19 @@ // For each unroll part, create a wide load for the group. SmallVector NewLoads; for (unsigned Part = 0; Part < UF; Part++) { - auto *NewLoad = Builder.CreateAlignedLoad( - NewPtrs[Part], Group->getAlignment(), "wide.vec"); + Instruction *NewLoad; + if (IsMaskRequired) { + auto *Undefs = UndefValue::get(Mask[Part]->getType()); + auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); + Value *ShuffledMask = Builder.CreateShuffleVector( + Mask[Part], Undefs, RepMask, "interleaved.mask"); + NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), + ShuffledMask, UndefValue::get(VecTy), + "wide.masked.vec"); + } + else + NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], + Group->getAlignment(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); } @@ -2079,8 +2111,18 @@ Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, "interleaved.vec"); - Instruction *NewStoreInstr = - Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); + Instruction *NewStoreInstr; + if (IsMaskRequired) { + auto *Undefs = UndefValue::get(Mask[Part]->getType()); + auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); + Value *ShuffledMask = Builder.CreateShuffleVector( + Mask[Part], Undefs, RepMask, "interleaved.mask"); + NewStoreInstr = Builder.CreateMaskedStore( + IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); + } + else + NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], + Group->getAlignment()); Group->addMetadata(NewStoreInstr); } @@ -4234,6 +4276,32 @@ return false; } +static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { + if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)) + return TTI.enableMaskedInterleavedAccessVectorization(); + + // If an override option has been passed in for interleaved accesses, use it. 
+ return EnableMaskedInterleavedMemAccesses; +} + +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, + unsigned VF) { + assert(isAccessInterleaved(I) && "Expecting interleaved access."); + assert(getWideningDecision(I, VF) == CM_Unknown && + "Decision should not be set yet."); + + if (!Legal->blockNeedsPredication(I->getParent()) || + !Legal->isMaskRequired(I)) + return true; + + if (!useMaskedInterleavedAccesses(TTI)) + return false; + + auto *Ty = getMemInstValueType(I); + return isa(I) ? TTI.isLegalMaskedLoad(Ty) + : TTI.isLegalMaskedStore(Ty); +} + bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, unsigned VF) { // Get and ensure we have a valid memory instruction. @@ -5353,13 +5421,17 @@ } // Calculate the cost of the whole interleaved group. - unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, - Group->getFactor(), Indices, - Group->getAlignment(), AS); - - if (Group->isReverse()) + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, Group->getFactor(), Indices, + Group->getAlignment(), AS, Legal->isMaskRequired(I)); + + if (Group->isReverse()) { + // TODO: Add support for reversed masked interleaved access. + assert(!Legal->isMaskRequired(I) && + "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + } return Cost; } @@ -5461,7 +5533,8 @@ continue; NumAccesses = Group->getNumMembers(); - InterleaveCost = getInterleaveGroupCost(&I, VF); + if (interleavedAccessCanBeWidened(&I, VF)) + InterleaveCost = getInterleaveGroupCost(&I, VF); } unsigned GatherScatterCost = @@ -6134,7 +6207,8 @@ } VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, - VFRange &Range) { + VFRange &Range, + VPlanPtr &Plan) { const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I); if (!IG) return nullptr; @@ -6156,7 +6230,11 @@ assert(I == IG->getInsertPos() && "Generating a recipe for an adjunct member of an interleave group"); - return new VPInterleaveRecipe(IG); + VPValue *Mask = nullptr; + if (Legal->isMaskRequired(I)) + Mask = createBlockInMask(I->getParent(), Plan); + + return new VPInterleaveRecipe(IG, Mask); } VPWidenMemoryInstructionRecipe * @@ -6424,7 +6502,7 @@ VPRecipeBase *Recipe = nullptr; // Check if Instr should belong to an interleave memory recipe, or already // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range))) { + if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { VPBB->appendRecipe(Recipe); return true; } @@ -6651,6 +6729,10 @@ O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); + if (User) { + O << ", "; + User->getOperand(0)->printAsOperand(O); + } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) @@ -6713,7 +6795,15 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + if (!User) + return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + + // Last (and currently only) operand is a mask. 
+ InnerLoopVectorizer::VectorParts MaskValues(State.UF); + VPValue *Mask = User->getOperand(User->getNumOperands() - 1); + for (unsigned Part = 0; Part < State.UF; ++Part) + MaskValues[Part] = State.get(Mask, Part); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7011,9 +7101,8 @@ UseInterleaved = EnableInterleavedMemAccesses; // Analyze interleaved memory accesses. - if (UseInterleaved) { - IAI.analyzeInterleaving(); - } + if (UseInterleaved) + IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); // Use the cost model. LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, Index: lib/Transforms/Vectorize/VPRecipeBuilder.h =================================================================== --- lib/Transforms/Vectorize/VPRecipeBuilder.h +++ lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -69,7 +69,8 @@ /// \return value is , as it is handled by another recipe. /// \p Range.End may be decreased to ensure same decision from \p Range.Start /// to \p Range.End. - VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); + VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan); /// Check if \I is a memory instruction to be widened for \p Range.Start and /// potentially masked. Such instructions are handled by a recipe that takes Index: lib/Transforms/Vectorize/VPlan.h =================================================================== --- lib/Transforms/Vectorize/VPlan.h +++ lib/Transforms/Vectorize/VPlan.h @@ -769,10 +769,14 @@ class VPInterleaveRecipe : public VPRecipeBase { private: const InterleaveGroup *IG; + std::unique_ptr User; public: - VPInterleaveRecipe(const InterleaveGroup *IG) - : VPRecipeBase(VPInterleaveSC), IG(IG) {} + VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), IG(IG) {} + : VPRecipeBase(VPInterleaveSC), IG(IG) { + if (Mask) // Create a VPInstruction to register as a user of the mask. + User.reset(new VPUser({Mask})); + } ~VPInterleaveRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. Index: test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll =================================================================== --- test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -0,0 +1,164 @@ +; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED +; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED + +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-unknown-linux-gnu" + +; When masked-interleaved-groups are disabled: +; Check that the predicated load is not vectorized as an +; interleaved-group but rather as scalarized accesses. +; (For SKX, Gather is not supported by the compiler for chars, therefore +; the only remaining alternative is to scalarize). +; When masked-interleave-group is enabled we expect to find the proper mask +; shuffling code, feeding the wide masked load for an interleave-group (with +; a single member).
+; +; void masked_strided1(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char t = p[2*ix]; +; q[ix] = t; +; } +; } +; } + +;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1( +;DISABLED_MASKED_STRIDED: vector.body: +;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. +;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> +;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], +;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 +;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. +;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> + +;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1( +;ENABLED_MASKED_STRIDED: vector.body: +;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> +;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef) +;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> + +define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.09, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.09, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09 + store i8 %0, i8* %arrayidx3, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.09, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +; Check also a scenario with full interleave-groups (no gaps) as well as both +; load and store groups. We check that when masked-interleave-group is disabled +; the predicated loads (and stores) are not vectorized as an +; interleaved-group but rather as four separate scalarized accesses. +; (For SKX, gather/scatter is not supported by the compiler for chars, therefore +; the only remaining alternative is to scalarize). +; When masked-interleave-group is enabled we expect to find the proper mask +; shuffling code, feeding the wide masked load/store for the two interleave- +; groups. 
+; +; void masked_strided2(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left = p[2*ix]; +; char right = p[2*ix + 1]; +; char max = max(left, right); +; q[2*ix] = max; +; q[2*ix+1] = 0 - max; +; } +; } +;} + +;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2( +;DISABLED_MASKED_STRIDED: vector.body: +;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store. +;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> +;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], +;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 +;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store. +;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> + +;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2( +;ENABLED_MASKED_STRIDED: vector.body: +;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> +;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef) +;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> +;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> +;ENABLED_MASKED_STRIDED: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask) + +; Function Attrs: norecurse nounwind +define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.024, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %add = or i32 %mul, 1 + %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add + %1 = load i8, i8* %arrayidx4, align 1 + %cmp.i = icmp slt i8 %0, %1 + %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0 + %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 %spec.select.i, i8* %arrayidx6, align 1 + %sub = sub i8 0, %spec.select.i + %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 %sub, i8* %arrayidx11, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} Index: 
test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll =================================================================== --- test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll +++ test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll @@ -0,0 +1,222 @@ +; REQUIRES: asserts +; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED +; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED + +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" + +; We test here that the loop-vectorizer forms an interleave-group from +; predicated memory accesses only if they are both in the same (predicated) +; block (first scenario below). +; If the accesses are not in the same predicated block, an interleave-group +; is not formed (scenarios 2,3 below). + +; Scenario 1: Check the case where it is legal to create masked interleave- +; groups. Altogether two groups are created (one for loads and one for stores) +; when masked-interleaved-accesses are enabled. When masked-interleaved-accesses +; are disabled we do not create any interleave-group. +; +; void masked_strided1(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left = p[2*ix]; +; char right = p[2*ix + 1]; +; char max = max(left, right); +; q[2*ix] = max; +; q[2*ix+1] = 0 - max; +; } +; } +;} + + +; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1" +; STRIDED_UNMASKED: LV: Analyzing interleaved accesses... +; STRIDED_UNMASKED-NOT: LV: Creating an interleave group + +; STRIDED_MASKED: LV: Checking a loop in "masked_strided1" +; STRIDED_MASKED: LV: Analyzing interleaved accesses... +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 %{{.*}}, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Inserted: store i8 %{{.*}}, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: into the interleave group with store i8 %{{.*}}, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: %{{.*}} = load i8, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Inserted: %{{.*}} = load i8, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: into the interleave group with %{{.*}} = load i8, i8* %{{.*}}, align 1 + +; Scenario 2: Check the case where it is illegal to create a masked interleave- +; group because the first access is predicated, and the second isn't. +; We therefore create a separate interleave-group with gaps for each of the +; stores (if masked-interleaved-accesses are enabled) and these are later +; invalidated because interleave-groups of stores with gaps are not supported. +; If masked-interleaved-accesses is not enabled we create only one interleave +; group of stores (for the non-predicated store) and it is later invalidated +; due to gaps. +; +; void masked_strided2(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard1, +; unsigned char guard2) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard1) { +; q[2*ix] = 1; +; } +; q[2*ix+1] = 2; +; } +;} + +; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2" +; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 +; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. +; STRIDED_UNMASKED-NOT: LV: Creating an interleave group + +; STRIDED_MASKED: LV: Checking a loop in "masked_strided2" +; STRIDED_MASKED: LV: Analyzing interleaved accesses... +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. + + +; Scenario 3: Check the case where it is illegal to create a masked interleave- +; group because the two accesses are in separate predicated blocks. +; We therefore create a separate interleave-group with gaps for each of the accesses, +; (which are later invalidated because interleave-groups of stores with gaps are +; not supported). +; If masked-interleaved-accesses is not enabled we don't create any interleave +; group because all accesses are predicated. +; +; void masked_strided3(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard1, +; unsigned char guard2) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard1) { +; q[2*ix] = 1; +; } +; if (ix > guard2) { +; q[2*ix+1] = 2; +; } +; } +;} + + +; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3" +; STRIDED_UNMASKED: LV: Analyzing interleaved accesses... +; STRIDED_UNMASKED-NOT: LV: Creating an interleave group + +; STRIDED_MASKED: LV: Checking a loop in "masked_strided3" +; STRIDED_MASKED: LV: Analyzing interleaved accesses... +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. 
+ + +; ModuleID = 'test.c' +source_filename = "test.c" +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-unknown-linux-gnu" + +define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.024, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %add = or i32 %mul, 1 + %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add + %1 = load i8, i8* %arrayidx4, align 1 + %cmp.i = icmp slt i8 %0, %1 + %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0 + %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 %spec.select.i, i8* %arrayidx6, align 1 + %sub = sub i8 0, %spec.select.i + %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 %sub, i8* %arrayidx11, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + + +define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %mul = shl nuw nsw i32 %ix.012, 1 + %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 1, i8* %arrayidx, align 1 + %cmp1 = icmp ugt i32 %ix.012, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %add = or i32 %mul, 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 2, i8* %arrayidx3, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.012, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + + +define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 { +entry: + %conv = zext i8 %guard1 to i32 + %conv3 = zext i8 %guard2 to i32 + br label %for.body + +for.body: + %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %mul = shl nuw nsw i32 %ix.018, 1 + %cmp1 = icmp ugt i32 %ix.018, %conv + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 1, i8* %arrayidx, align 1 + br label %if.end + +if.end: + %cmp4 = icmp ugt i32 %ix.018, %conv3 + br i1 %cmp4, label %if.then6, label %for.inc + +if.then6: + %add = or i32 %mul, 1 + %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 2, i8* %arrayidx7, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.018, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" } Index: test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll =================================================================== --- test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 
-enable-interleaved-mem-accesses < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" %pair = type { i64, i64 }
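
For reference, a minimal LLVM IR sketch of the lowering the masked-interleaved path produces for a predicated strided load with VF=8 and interleave factor 2 (placeholder names such as %vec.ind, %splat and %gep are illustrative only; compare the ENABLED_MASKED_STRIDED checks in x86-interleaved-accesses-masked-group.ll above). The 8-lane block mask is replicated per lane by createReplicatedMask, fed to a single wide masked load, and the group member is then extracted with a strided shuffle:

; 8-lane block mask computed from the loop predicate
%mask = icmp ugt <8 x i32> %vec.ind, %splat
; replicate each mask lane Factor=2 times: <0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7>
%interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; one wide masked load covering the whole interleave group
%wide.masked.vec = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %gep, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
; de-interleave member 0 of the group
%strided.vec = shufflevector <16 x i8> %wide.masked.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>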