Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -806,6 +806,12 @@ /// Enable matching of interleaved access groups. bool enableInterleavedAccessVectorization() const; + bool hasInterleavedLoad(VectorType *VecTy, Value *Addr, uint32_t Factor, + bool IsMasked) const; + + bool hasInterleavedStore(SmallVectorImpl &StoredVecs, Value *Addr, + uint32_t Factor, bool IsMasked) const; + /// Enable matching of interleaved access groups that contain predicated /// accesses or gaps and therefore vectorized using masked /// vector loads/stores. @@ -1671,6 +1677,11 @@ virtual MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0; virtual bool enableInterleavedAccessVectorization() = 0; + virtual bool hasInterleavedLoad(VectorType *VecTy, Value *Addr, + uint32_t Factor, bool IsMasked) = 0; + virtual bool hasInterleavedStore(SmallVectorImpl &StoredVecs, + Value *Addr, uint32_t Factor, + bool IsMasked) = 0; virtual bool enableMaskedInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context, @@ -2156,6 +2167,14 @@ bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); } + bool hasInterleavedLoad(VectorType *VecTy, Value *Addr, uint32_t Factor, + bool IsMasked) override { + return Impl.hasInterleavedLoad(VecTy, Addr, Factor, IsMasked); + } + bool hasInterleavedStore(SmallVectorImpl &StoredVecs, Value *Addr, + uint32_t Factor, bool IsMasked) override { + return Impl.hasInterleavedStore(StoredVecs, Addr, Factor, IsMasked); + } bool enableMaskedInterleavedAccessVectorization() override { return Impl.enableMaskedInterleavedAccessVectorization(); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -358,6 +358,16 @@ bool enableInterleavedAccessVectorization() const { return false; } + bool hasInterleavedLoad(VectorType *VecTy, Value *Addr, uint32_t Factor, + bool IsMasked) const { + return false; + } + + bool hasInterleavedStore(SmallVectorImpl &StoredVecs, Value *Addr, + uint32_t Factor, bool IsMasked) const { + return false; + } + bool enableMaskedInterleavedAccessVectorization() const { return false; } bool isFPVectorizationPotentiallyUnsafe() const { return false; } Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -2853,6 +2853,12 @@ return false; } + /// Lower an interleaved load to target specific intrinsics. Return + /// true on success. + /// + /// \p LI is the vector interleaved load intrinsic. + virtual bool lowerInterleavedLoad(IntrinsicInst *LI) const { return false; } + /// Lower an interleaved store to target specific intrinsics. Return /// true on success. /// @@ -2864,6 +2870,12 @@ return false; } + /// Lower an interleaved store to target specific intrinsics. Return + /// true on success. + /// + /// \p SI is the vector interleaved store intrinsic. 
+ virtual bool lowerInterleavedStore(IntrinsicInst *SI) const { return false; } + /// Return true if zero-extending the specific node Val to type VT2 is free /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or /// because it's folded such as X86 zero-extending loads). Index: llvm/include/llvm/IR/IRBuilder.h =================================================================== --- llvm/include/llvm/IR/IRBuilder.h +++ llvm/include/llvm/IR/IRBuilder.h @@ -764,6 +764,17 @@ /// Create a call to llvm.threadlocal.address intrinsic. CallInst *CreateThreadLocalAddress(Value *Ptr); + /// Create a call to a Masked Interleaved Load intrinsic. + CallInst *CreateMaskedInterleavedLoad(uint32_t Factor, Type *Ty, Value *Ptr, + Align Alignment, Value *Mask = nullptr, + Value *PassThru = nullptr, + const Twine &Name = ""); + + /// Create a call to a Masked Interleaved Store intrinsic. + CallInst *CreateMaskedInterleavedStore(uint32_t Factor, ArrayRef Val, + Value *Ptr, Align Alignment, + Value *Mask = nullptr); + /// Create a call to Masked Load intrinsic CallInst *CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru = nullptr, const Twine &Name = ""); Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -1753,6 +1753,41 @@ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [IntrWriteMem, IntrWillReturn, ImmArg>]>; +def int_experimental_masked_interleaved2_load: + DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMPointerToElt<0>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], + [IntrReadMem, IntrArgMemOnly]>; + +def int_experimental_masked_interleaved3_load: + DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], + [LLVMPointerToElt<0>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], + [IntrReadMem, IntrArgMemOnly]>; + +def int_experimental_masked_interleaved4_load: + DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>], + [LLVMPointerToElt<0>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], + [IntrReadMem, IntrArgMemOnly]>; + +def int_experimental_masked_interleaved2_store: + DefaultAttrsIntrinsic<[], + [llvm_anyvector_ty, LLVMMatchType<0>, + LLVMPointerToElt<0>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], + [IntrWriteMem, IntrWillReturn, ImmArg>]>; + +def int_experimental_masked_interleaved3_store: + DefaultAttrsIntrinsic<[], + [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, + LLVMPointerToElt<0>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], + [IntrWriteMem, IntrWillReturn, ImmArg>]>; + +def int_experimental_masked_interleaved4_store: + DefaultAttrsIntrinsic<[], + [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMPointerToElt<0>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], + [IntrWriteMem, IntrWillReturn, ImmArg>]>; + def int_masked_expandload: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMPointerToElt<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -546,6 +546,18 @@ return TTIImpl->enableInterleavedAccessVectorization(); } +bool 
TargetTransformInfo::hasInterleavedLoad(VectorType *VecTy, Value *Addr, + uint32_t Factor, + bool IsMasked) const { + return TTIImpl->hasInterleavedLoad(VecTy, Addr, Factor, IsMasked); +} + +bool TargetTransformInfo::hasInterleavedStore( + SmallVectorImpl &StoredVecs, Value *Addr, uint32_t Factor, + bool IsMasked) const { + return TTIImpl->hasInterleavedStore(StoredVecs, Addr, Factor, IsMasked); +} + bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const { return TTIImpl->enableMaskedInterleavedAccessVectorization(); } Index: llvm/lib/CodeGen/InterleavedAccessPass.cpp =================================================================== --- llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -58,6 +58,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -113,6 +114,10 @@ bool lowerInterleavedStore(StoreInst *SI, SmallVector &DeadInsts); + /// Transform an interleaved llvm intrinsic into target specific intrinsics. + bool lowerInterleavedIntrinsic(IntrinsicInst *SI, + SmallVector &DeadInsts); + /// Returns true if the uses of an interleaved load by the /// extractelement instructions in \p Extracts can be replaced by uses of the /// shufflevector instructions in \p Shuffles instead. If so, the necessary @@ -517,6 +522,28 @@ return true; } +bool InterleavedAccess::lowerInterleavedIntrinsic( + IntrinsicInst *II, SmallVector &DeadInsts) { + switch (II->getIntrinsicID()) { + case Intrinsic::experimental_masked_interleaved2_load: + case Intrinsic::experimental_masked_interleaved3_load: + case Intrinsic::experimental_masked_interleaved4_load: + if (!TLI->lowerInterleavedLoad(II)) + return false; + break; + case Intrinsic::experimental_masked_interleaved2_store: + case Intrinsic::experimental_masked_interleaved3_store: + case Intrinsic::experimental_masked_interleaved4_store: + if (!TLI->lowerInterleavedStore(II)) + return false; + break; + default: + return false; + } + DeadInsts.push_back(II); + return true; +} + bool InterleavedAccess::runOnFunction(Function &F) { auto *TPC = getAnalysisIfAvailable(); if (!TPC || !LowerInterleavedAccesses) @@ -537,6 +564,9 @@ if (auto *LI = dyn_cast(&I)) Changed |= lowerInterleavedLoad(LI, DeadInsts); + if (auto *II = dyn_cast(&I)) + Changed |= lowerInterleavedIntrinsic(II, DeadInsts); + if (auto *SI = dyn_cast(&I)) Changed |= lowerInterleavedStore(SI, DeadInsts); } Index: llvm/lib/IR/IRBuilder.cpp =================================================================== --- llvm/lib/IR/IRBuilder.cpp +++ llvm/lib/IR/IRBuilder.cpp @@ -576,6 +576,110 @@ return CreateCall(FnIntrinsic, {Scope}); } +/// Create a call to a Masked Interleaved Load intrinsic. 
+/// \p Ty - vector type to load +/// \p Ptr - base pointer for the load +/// \p Alignment - alignment of the source location +/// \p Mask - vector of booleans which indicates what vector lanes should +/// be accessed in memory +/// \p PassThru - pass-through value that is used to fill the masked-off lanes +/// of the result +/// \p Name - name of the result variable +/// \p Factor - interleaving factor +CallInst *IRBuilderBase::CreateMaskedInterleavedLoad( + uint32_t Factor, Type *Ty, Value *Ptr, Align Alignment, Value *Mask, + Value *PassThru, const Twine &Name) { + assert(Ty->isVectorTy() && "Type should be vector"); + auto *PtrTy = cast(Ptr->getType()); + assert(PtrTy->isOpaqueOrPointeeTypeMatches(Ty) && "Wrong element type"); + + auto *VecTy = cast(Ty); + ElementCount NumElts = VecTy->getElementCount().divideCoefficientBy(Factor); + auto *LDVTy = + VectorType::get(VecTy->getElementType(), + VecTy->getElementCount().divideCoefficientBy(Factor)); + if (!Mask) + Mask = Constant::getAllOnesValue( + VectorType::get(Type::getInt1Ty(Context), NumElts)); + if (!PassThru) + PassThru = UndefValue::get(LDVTy); + + auto *PtrVecTy = LDVTy->getElementType()->getPointerTo( + Ptr->getType()->getPointerAddressSpace()); + Type *OverloadedTypes[] = {LDVTy}; + Value *Ops[] = {this->CreateBitCast(Ptr, PtrVecTy), + getInt32(Alignment.value()), Mask, PassThru}; + switch (Factor) { + case 2: + return CreateMaskedIntrinsic( + Intrinsic::experimental_masked_interleaved2_load, Ops, OverloadedTypes, + Name); + case 3: + return CreateMaskedIntrinsic( + Intrinsic::experimental_masked_interleaved3_load, Ops, OverloadedTypes, + Name); + case 4: + return CreateMaskedIntrinsic( + Intrinsic::experimental_masked_interleaved4_load, Ops, OverloadedTypes, + Name); + default: + break; + } + return nullptr; +} + +/// Create a call to a Masked Interleaved Store intrinsic. 
+/// \p StoredVals - data to be stored +/// \p Ptr - base pointer for the store +/// \p Alignment - alignment of the destination location +/// \p Mask - vector of booleans which indicates what vector lanes should +/// be accessed in memory +/// \p Factor - interleaving factor +CallInst *IRBuilderBase::CreateMaskedInterleavedStore( + uint32_t Factor, ArrayRef StoredVals, Value *Ptr, Align Alignment, + Value *Mask) { + assert(StoredVals.size() == Factor && + "Not enough data to store for given factor"); + Type *DataTy = (*StoredVals.begin())->getType(); +#ifndef NDEBUG + for (auto &Val : StoredVals) + assert(Val->getType()->isVectorTy() && "Stored value should be a vector"); +#endif + auto *VecTy = cast(DataTy); + auto *PtrTy = VecTy->getElementType()->getPointerTo( + Ptr->getType()->getPointerAddressSpace()); + assert(PtrTy->isOpaqueOrPointeeTypeMatches(VecTy->getElementType()) && + "Wrong element type"); + + ElementCount NumElts = VecTy->getElementCount(); + if (!Mask) + Mask = Constant::getAllOnesValue( + VectorType::get(Type::getInt1Ty(Context), NumElts)); + + Type *OverloadedTypes[] = {DataTy}; + SmallVector Ops(StoredVals.begin(), StoredVals.end()); + Ops.append( + {this->CreateBitCast(Ptr, PtrTy), getInt32(Alignment.value()), Mask}); + + switch (Factor) { + case 2: + return CreateMaskedIntrinsic( + Intrinsic::experimental_masked_interleaved2_store, Ops, + OverloadedTypes); + case 3: + return CreateMaskedIntrinsic( + Intrinsic::experimental_masked_interleaved3_store, Ops, + OverloadedTypes); + case 4: + return CreateMaskedIntrinsic( + Intrinsic::experimental_masked_interleaved4_store, Ops, + OverloadedTypes); + default: + break; + } + return nullptr; +} + /// Create a call to a Masked Load intrinsic. /// \p Ty - vector type to load /// \p Ptr - base pointer for the load Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -620,8 +620,10 @@ ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; + bool lowerInterleavedLoad(IntrinsicInst *LI) const override; bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; + bool lowerInterleavedStore(IntrinsicInst *SI) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalICmpImmediate(int64_t) const override; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13490,8 +13490,11 @@ /// will generate when lowering accesses of the given type. unsigned AArch64TargetLowering::getNumInterleavedAccesses( VectorType *VecTy, const DataLayout &DL, bool UseScalable) const { + auto EC = VecTy->getElementCount(); + unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); unsigned VecSize = UseScalable ? 
Subtarget->getMinSVEVectorSizeInBits() : 128; - return std::max(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize); + return std::max(1, + (EC.getKnownMinValue() * ElSize + 127) / VecSize); } MachineMemOperand::Flags @@ -13505,24 +13508,30 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType( VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const { - unsigned VecSize = DL.getTypeSizeInBits(VecTy); unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); - unsigned NumElements = cast(VecTy)->getNumElements(); + auto EC = VecTy->getElementCount(); UseScalable = false; // Ensure the number of vector elements is greater than 1. - if (NumElements < 2) + if (EC.getKnownMinValue() < 2) return false; // Ensure the element type is legal. if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) return false; + if (EC.isScalable()) { + if (EC.getKnownMinValue() * ElSize == 128) + return true; + return false; + } + + unsigned VecSize = DL.getTypeSizeInBits(VecTy); if (Subtarget->useSVEForFixedLengthVectors() && (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 || (VecSize < Subtarget->getMinSVEVectorSizeInBits() && - isPowerOf2_32(NumElements) && VecSize > 128))) { + isPowerOf2_32(EC.getKnownMinValue()) && VecSize > 128))) { UseScalable = true; return true; } @@ -13718,6 +13727,27 @@ return true; } +bool AArch64TargetLowering::lowerInterleavedLoad(IntrinsicInst *LI) const { + if (!Subtarget->hasSVE()) + return false; + IRBuilder<> Builder(LI); + static const llvm::Intrinsic::ID SVELoadIntrs[3] = { + Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret, + Intrinsic::aarch64_sve_ld4_sret}; + auto *RetTy = dyn_cast(LI->getType()); + if (!RetTy) + return false; + uint64_t Factor = RetTy->getNumElements(); + if (Factor > 4 || Factor < 2) + return false; + auto *LdNFunc = Intrinsic::getDeclaration( + LI->getModule(), SVELoadIntrs[Factor - 2], {RetTy->getElementType(0)}); + auto *AArch64LI = + Builder.CreateCall(LdNFunc, {LI->getOperand(2), LI->getOperand(0)}); + LI->replaceAllUsesWith(AArch64LI); + return true; +} + /// Lower an interleaved store into a stN intrinsic. /// /// E.g. 
Lower an interleaved store (Factor = 3): @@ -13899,6 +13929,35 @@ return true; } +bool AArch64TargetLowering::lowerInterleavedStore(IntrinsicInst *SI) const { + if (!Subtarget->hasSVE()) + return false; + IRBuilder<> Builder(SI); + static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2, + Intrinsic::aarch64_sve_st3, + Intrinsic::aarch64_sve_st4}; + auto *VecTy = SI->getOperand(0)->getType(); + if (!VecTy) + return false; + unsigned Factor = 0; + for (auto &Arg : SI->args()) { + if (!Arg->getType()->isVectorTy()) + break; + Factor++; + } + + if (Factor > 4 || Factor < 2) + return false; + auto *StNFunc = Intrinsic::getDeclaration(SI->getModule(), + SVEStoreIntrs[Factor - 2], {VecTy}); + SmallVector Ops(SI->arg_begin(), SI->arg_begin() + Factor); + Ops.push_back(SI->getOperand(Factor + 2)); // mask + Ops.push_back(SI->getOperand(Factor)); // addr + auto *AArch64SI = Builder.CreateCall(StNFunc, Ops); + SI->replaceAllUsesWith(AArch64SI); + return true; +} + EVT AArch64TargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat); Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -96,7 +96,13 @@ /// \name Vector TTI Implementations /// @{ - bool enableInterleavedAccessVectorization() { return true; } + bool enableInterleavedAccessVectorization(); + + bool hasInterleavedLoad(VectorType *VecTy, Value *Addr, uint32_t Factor, + bool IsMasked); + + bool hasInterleavedStore(SmallVectorImpl &StoredVecs, Value *Addr, + uint32_t Factor, bool IsMasked); unsigned getNumberOfRegisters(unsigned ClassID) const { bool Vector = (ClassID == 1); Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -38,6 +38,34 @@ static cl::opt SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden); +cl::opt EnableSVEInterleavedMemAccesses( + "enable-sve-interleaved-mem-accesses", cl::init(true), cl::Hidden, + cl::desc("Enable vectorization on interleaved memory accesses in a loop " + "using sve load/store.")); + +bool AArch64TTIImpl::enableInterleavedAccessVectorization() { + return !ST->hasSVE() || EnableSVEInterleavedMemAccesses; +} + +bool AArch64TTIImpl::hasInterleavedLoad(VectorType *VecTy, Value *Addr, + uint32_t Factor, bool IsMasked) { + if (!enableInterleavedAccessVectorization() || + !isa(VecTy)) + return false; + if (Factor < 1 || Factor > 4) + return false; + + return true; +} + +bool AArch64TTIImpl::hasInterleavedStore(SmallVectorImpl &StoredVecs, + Value *Addr, uint32_t Factor, + bool IsMasked) { + return hasInterleavedLoad( + dyn_cast((*StoredVecs.begin())->getType()), Addr, Factor, + IsMasked); +} + class TailFoldingKind { private: uint8_t Bits = 0; // Currently defaults to disabled. 
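// --- Illustrative sketch, not part of the patch ---
// A minimal example of how a client transform might combine the new TTI hook
// with the IRBuilder helper introduced in this patch: query hasInterleavedLoad()
// to see whether the target can lower a factor-N access directly, emit
// llvm.experimental.masked.interleaved<N>.load via CreateMaskedInterleavedLoad()
// when it can, and otherwise fall back to the plain wide load. The wrapper name
// and its parameters are hypothetical; only the TTI and IRBuilder entry points
// come from this patch. Note that the wide vector type (VF * Factor elements)
// is passed in and divided by the factor inside the builder, matching the
// LoopVectorize change below.
//
//   static Value *emitGroupLoad(IRBuilder<> &B, const TargetTransformInfo &TTI,
//                               VectorType *WideVecTy, Value *Addr,
//                               unsigned Factor, Align Alignment) {
//     if (TTI.hasInterleavedLoad(WideVecTy, Addr, Factor, /*IsMasked=*/false))
//       // Returns an aggregate { <sub-vec>, ... } with one member per field,
//       // later split with extractvalue instead of strided shuffles.
//       return B.CreateMaskedInterleavedLoad(Factor, WideVecTy, Addr, Alignment);
//     return B.CreateAlignedLoad(WideVecTy, Addr, Alignment, "wide.vec");
//   }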
@@ -2364,22 +2392,28 @@ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { assert(Factor >= 2 && "Invalid interleave factor"); - auto *VecVTy = cast(VecTy); - + auto *VecVTy = cast(VecTy); if (!UseMaskForCond && !UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { - unsigned NumElts = VecVTy->getNumElements(); + unsigned NumElts = VecVTy->getElementCount().getKnownMinValue(); auto *SubVecTy = - FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); + VectorType::get(VecVTy->getElementType(), + VecVTy->getElementCount().divideCoefficientBy(Factor)); // ldN/stN only support legal vector types of size 64 or 128 in bits. // Accesses having vector types that are a multiple of 128 bits can be // matched to more than one ldN/stN instruction. bool UseScalable; - if (NumElts % Factor == 0 && - TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) - return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); + if (NumElts % Factor == 0) { + if (TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) + return Factor * + TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); + } } + // Calling the base implementation should only happen + // when compiler wants to scalarize the operation. + if (isa(VecTy)) + return InstructionCost::getInvalid(); return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, CostKind, Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2556,7 +2556,6 @@ // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getLoadStoreType(Instr); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. @@ -2567,14 +2566,21 @@ assert((!BlockInMask || !Group->isReverse()) && "Reversed masked interleave-group not supported."); + Value *Idx; // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. - if (Group->isReverse()) - Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); + if (Group->isReverse()) { + Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); + Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); + Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); + Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); + Idx = Builder.CreateNeg(Idx); + } else + Idx = Builder.getInt32(-Index); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); @@ -2595,7 +2601,7 @@ bool InBounds = false; if (auto *gep = dyn_cast(AddrPart->stripPointerCasts())) InBounds = gep->isInBounds(); - AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); + AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx); cast(AddrPart)->setIsInBounds(InBounds); // Cast to the vector pointer type. 
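// --- Illustrative sketch, not part of the patch ---
// The store side mirrors the load side: the per-member vectors collected for an
// interleave group are either handed to llvm.experimental.masked.interleaved<N>.store
// (when the target reports support through hasInterleavedStore()) or concatenated
// and shuffled into a single wide store as before. The wrapper below is
// hypothetical and only names the entry points this patch adds.
//
//   static void emitGroupStore(IRBuilder<> &B, const TargetTransformInfo &TTI,
//                              SmallVectorImpl<Value *> &Members, Value *Addr,
//                              unsigned Factor, Align Alignment) {
//     if (TTI.hasInterleavedStore(Members, Addr, Factor, /*IsMasked=*/false)) {
//       B.CreateMaskedInterleavedStore(Factor, Members, Addr, Alignment);
//       return;
//     }
//     // Fallback: interleave in IR (concatenateVectors + shufflevector) and
//     // emit one wide store, as the pre-existing code path does.
//   }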
@@ -2637,10 +2643,17 @@ NewLoad = Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), GroupMask, PoisonVec, "wide.masked.vec"); + } else { + // Check if we can create target specific interleaving load. + // We can not express alignment so just drop it for now. + if (TTI->hasInterleavedLoad(VecTy, AddrParts[Part], Group->getFactor(), + false)) + NewLoad = Builder.CreateMaskedInterleavedLoad( + Group->getFactor(), VecTy, AddrParts[Part], Group->getAlign()); + else + NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], + Group->getAlign(), "wide.vec"); } - else - NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], - Group->getAlign(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); } @@ -2658,12 +2671,15 @@ auto StrideMask = createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); for (unsigned Part = 0; Part < UF; Part++) { - Value *StridedVec = Builder.CreateShuffleVector( - NewLoads[Part], StrideMask, "strided.vec"); + Value *StridedVec; + if (NewLoads[Part]->getType()->isAggregateType()) + StridedVec = Builder.CreateExtractValue(NewLoads[Part], I); + else + StridedVec = Builder.CreateShuffleVector(NewLoads[Part], StrideMask, + "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); VectorType *OtherVTy = VectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } @@ -2715,6 +2731,17 @@ StoredVecs.push_back(StoredVec); } + // Check if we can create target specific interleaving store. + // We can not express alignment so just drop it for now. + if (TTI->hasInterleavedStore(StoredVecs, AddrParts[Part], + Group->getFactor(), false)) { + CallInst *Store = Builder.CreateMaskedInterleavedStore( + Group->getFactor(), StoredVecs, AddrParts[Part], Group->getAlign()); + + // create interleaved store + Group->addMetadata(Store); + continue; + } // Concatenate all vectors into a wide vector. Value *WideVec = concatenateVectors(Builder, StoredVecs); @@ -2904,10 +2931,11 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. - auto *DstFVTy = cast(DstVTy); - unsigned VF = DstFVTy->getNumElements(); - auto *SrcVecTy = cast(V->getType()); - assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); + auto *DstFVTy = cast(DstVTy); + auto VF = DstFVTy->getElementCount(); + auto *SrcVecTy = cast(V->getType()); + assert((VF == SrcVecTy->getElementCount()) && + "Vector dimensions do not match"); Type *SrcElemTy = SrcVecTy->getElementType(); Type *DstElemTy = DstFVTy->getElementType(); assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && @@ -2927,7 +2955,7 @@ "Only one type should be a floating point type"); Type *IntTy = IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); - auto *VecIntTy = FixedVectorType::get(IntTy, VF); + auto *VecIntTy = VectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } @@ -6483,11 +6511,6 @@ InstructionCost LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, ElementCount VF) { - // TODO: Once we have support for interleaving with scalable vectors - // we can calculate the cost properly here. 
- if (VF.isScalable()) - return InstructionCost::getInvalid(); - Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); Index: llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses-load.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses-load.ll @@ -0,0 +1,361 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefix=NEON +; RUN: opt < %s -mattr=+sve -interleaved-access -S | FileCheck %s -check-prefix=SVE + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; ld2b +define { , } @ld2.nxv16i8( %Pg, i8 *%addr) { +; NEON-LABEL: @ld2.nxv16i8( +; NEON-NEXT: [[RES:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv16i8(i8* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , } [[RES]] +; +; SVE-LABEL: @ld2.nxv16i8( +; SVE-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv16i8( [[PG:%.*]], i8* [[ADDR:%.*]]) +; SVE-NEXT: ret { , } [[LDN]] +; + %res = call { , } @llvm.experimental.masked.interleaved2.load.nxv16i8(i8 *%addr, i32 0, %Pg, undef) + ret { , } %res +} + +; ld2h +define { , } @ld2.nxv8i16( %Pg, i16 *%addr) { +; NEON-LABEL: @ld2.nxv8i16( +; NEON-NEXT: [[RES:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv8i16(i16* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , } [[RES]] +; +; SVE-LABEL: @ld2.nxv8i16( +; SVE-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv8i16( [[PG:%.*]], i16* [[ADDR:%.*]]) +; SVE-NEXT: ret { , } [[LDN]] +; + %res = call { , } @llvm.experimental.masked.interleaved2.load.nxv8i16(i16 *%addr, i32 0, %Pg, undef) + ret { , } %res +} + +define { , } @ld2.nxv8f16( %Pg, half *%addr) { +; NEON-LABEL: @ld2.nxv8f16( +; NEON-NEXT: [[RES:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv8f16(half* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , } [[RES]] +; +; SVE-LABEL: @ld2.nxv8f16( +; SVE-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv8f16( [[PG:%.*]], half* [[ADDR:%.*]]) +; SVE-NEXT: ret { , } [[LDN]] +; + %res = call { , } @llvm.experimental.masked.interleaved2.load.nxv8f16(half *%addr, i32 0, %Pg, undef) + ret { , } %res +} + +define { , } @ld2.nxv8bf16( %Pg, bfloat *%addr) #0 { +; NEON-LABEL: @ld2.nxv8bf16( +; NEON-NEXT: [[RES:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv8bf16(bfloat* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , } [[RES]] +; +; SVE-LABEL: @ld2.nxv8bf16( +; SVE-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv8bf16( [[PG:%.*]], bfloat* [[ADDR:%.*]]) +; SVE-NEXT: ret { , } [[LDN]] +; + %res = call { , } @llvm.experimental.masked.interleaved2.load.nxv8bf16(bfloat *%addr, i32 0, %Pg, undef) + ret { , } %res +} + +; ld2w +define { , } @ld2.nxv4i32( %Pg, i32 *%addr) { +; NEON-LABEL: @ld2.nxv4i32( +; NEON-NEXT: [[RES:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , } [[RES]] +; +; SVE-LABEL: @ld2.nxv4i32( +; SVE-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( [[PG:%.*]], i32* [[ADDR:%.*]]) +; SVE-NEXT: ret { , } [[LDN]] +; + %res = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32 *%addr, i32 0, %Pg, undef) + ret { , } %res +} + +define { , } 
@ld2.nxv4f32( %Pg, float *%addr) { +; NEON-LABEL: @ld2.nxv4f32( +; NEON-NEXT: [[RES:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4f32(float* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , } [[RES]] +; +; SVE-LABEL: @ld2.nxv4f32( +; SVE-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4f32( [[PG:%.*]], float* [[ADDR:%.*]]) +; SVE-NEXT: ret { , } [[LDN]] +; + %res = call { , } @llvm.experimental.masked.interleaved2.load.nxv4f32(float *%addr, i32 0, %Pg, undef) + ret { , } %res +} + +; ld2d +define { , } @ld2.nxv2i64( %Pg, i64 *%addr) { +; NEON-LABEL: @ld2.nxv2i64( +; NEON-NEXT: [[RES:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv2i64(i64* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , } [[RES]] +; +; SVE-LABEL: @ld2.nxv2i64( +; SVE-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2i64( [[PG:%.*]], i64* [[ADDR:%.*]]) +; SVE-NEXT: ret { , } [[LDN]] +; + %res = call { , } @llvm.experimental.masked.interleaved2.load.nxv2i64(i64 *%addr, i32 0, %Pg, undef) + ret { , } %res +} + +define { , } @ld2.nxv2f64( %Pg, double *%addr) { +; NEON-LABEL: @ld2.nxv2f64( +; NEON-NEXT: [[RES:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv2f64(double* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , } [[RES]] +; +; SVE-LABEL: @ld2.nxv2f64( +; SVE-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2f64( [[PG:%.*]], double* [[ADDR:%.*]]) +; SVE-NEXT: ret { , } [[LDN]] +; + %res = call { , } @llvm.experimental.masked.interleaved2.load.nxv2f64(double *%addr, i32 0, %Pg, undef) + ret { , } %res +} + +; ld3b +define { , , } @ld3.nxv16i8( %Pg, i8 *%addr) { +; NEON-LABEL: @ld3.nxv16i8( +; NEON-NEXT: [[RES:%.*]] = call { , , } @llvm.experimental.masked.interleaved3.load.nxv16i8(i8* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , } [[RES]] +; +; SVE-LABEL: @ld3.nxv16i8( +; SVE-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv16i8( [[PG:%.*]], i8* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , } [[LDN]] +; + %res = call { , , } @llvm.experimental.masked.interleaved3.load.nxv16i8(i8 *%addr, i32 0, %Pg, undef) + ret { , , } %res +} + +; ld3h +define { , , } @ld3.nxv8i16( %Pg, i16 *%addr) { +; NEON-LABEL: @ld3.nxv8i16( +; NEON-NEXT: [[RES:%.*]] = call { , , } @llvm.experimental.masked.interleaved3.load.nxv8i16(i16* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , } [[RES]] +; +; SVE-LABEL: @ld3.nxv8i16( +; SVE-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv8i16( [[PG:%.*]], i16* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , } [[LDN]] +; + %res = call { , , } @llvm.experimental.masked.interleaved3.load.nxv8i16(i16 *%addr, i32 0, %Pg, undef) + ret { , , } %res +} + +define { , , } @ld3.nxv8f16( %Pg, half *%addr) { +; NEON-LABEL: @ld3.nxv8f16( +; NEON-NEXT: [[RES:%.*]] = call { , , } @llvm.experimental.masked.interleaved3.load.nxv8f16(half* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , } [[RES]] +; +; SVE-LABEL: @ld3.nxv8f16( +; SVE-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv8f16( [[PG:%.*]], half* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , } [[LDN]] +; + %res = call { , , } @llvm.experimental.masked.interleaved3.load.nxv8f16(half *%addr, i32 0, %Pg, undef) + ret { , , } %res +} + +define { , , } @ld3.nxv8bf16( %Pg, bfloat *%addr) #0 { +; NEON-LABEL: @ld3.nxv8bf16( +; NEON-NEXT: [[RES:%.*]] = call { , , } @llvm.experimental.masked.interleaved3.load.nxv8bf16(bfloat* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { 
, , } [[RES]] +; +; SVE-LABEL: @ld3.nxv8bf16( +; SVE-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv8bf16( [[PG:%.*]], bfloat* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , } [[LDN]] +; + %res = call { , , } @llvm.experimental.masked.interleaved3.load.nxv8bf16(bfloat *%addr, i32 0, %Pg, undef) + ret { , , } %res +} + +; ld3w +define { , , } @ld3.nxv4i32( %Pg, i32 *%addr) { +; NEON-LABEL: @ld3.nxv4i32( +; NEON-NEXT: [[RES:%.*]] = call { , , } @llvm.experimental.masked.interleaved3.load.nxv4i32(i32* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , } [[RES]] +; +; SVE-LABEL: @ld3.nxv4i32( +; SVE-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv4i32( [[PG:%.*]], i32* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , } [[LDN]] +; + %res = call { , , } @llvm.experimental.masked.interleaved3.load.nxv4i32(i32 *%addr, i32 0, %Pg, undef) + ret { , , } %res +} + +define { , , } @ld3.nxv4f32( %Pg, float *%addr) { +; NEON-LABEL: @ld3.nxv4f32( +; NEON-NEXT: [[RES:%.*]] = call { , , } @llvm.experimental.masked.interleaved3.load.nxv4f32(float* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , } [[RES]] +; +; SVE-LABEL: @ld3.nxv4f32( +; SVE-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv4f32( [[PG:%.*]], float* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , } [[LDN]] +; + %res = call { , , } @llvm.experimental.masked.interleaved3.load.nxv4f32(float *%addr, i32 0, %Pg, undef) + ret { , , } %res +} + +; ld3d +define { , , } @ld3.nxv2i64( %Pg, i64 *%addr) { +; NEON-LABEL: @ld3.nxv2i64( +; NEON-NEXT: [[RES:%.*]] = call { , , } @llvm.experimental.masked.interleaved3.load.nxv2i64(i64* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , } [[RES]] +; +; SVE-LABEL: @ld3.nxv2i64( +; SVE-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv2i64( [[PG:%.*]], i64* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , } [[LDN]] +; + %res = call { , , } @llvm.experimental.masked.interleaved3.load.nxv2i64(i64 *%addr, i32 0, %Pg, undef) + ret { , , } %res +} + +define { , , } @ld3.nxv2f64( %Pg, double *%addr) { +; NEON-LABEL: @ld3.nxv2f64( +; NEON-NEXT: [[RES:%.*]] = call { , , } @llvm.experimental.masked.interleaved3.load.nxv2f64(double* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , } [[RES]] +; +; SVE-LABEL: @ld3.nxv2f64( +; SVE-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv2f64( [[PG:%.*]], double* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , } [[LDN]] +; + %res = call { , , } @llvm.experimental.masked.interleaved3.load.nxv2f64(double *%addr, i32 0, %Pg, undef) + ret { , , } %res +} + +; ld4b +define { , , , } @ld4.nxv16i8( %Pg, i8 *%addr) { +; NEON-LABEL: @ld4.nxv16i8( +; NEON-NEXT: [[RES:%.*]] = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv16i8(i8* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , , } [[RES]] +; +; SVE-LABEL: @ld4.nxv16i8( +; SVE-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv16i8( [[PG:%.*]], i8* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , , } [[LDN]] +; + %res = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv16i8(i8 *%addr, i32 0, %Pg, undef) + ret { , , , } %res +} + +; ld4h +define { , , , } @ld4.nxv8i16( %Pg, i16 *%addr) { +; NEON-LABEL: @ld4.nxv8i16( +; NEON-NEXT: [[RES:%.*]] = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv8i16(i16* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , , } [[RES]] +; +; SVE-LABEL: @ld4.nxv8i16( +; SVE-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv8i16( [[PG:%.*]], i16* [[ADDR:%.*]]) +; 
SVE-NEXT: ret { , , , } [[LDN]] +; + %res = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv8i16(i16 *%addr, i32 0, %Pg, undef) + ret { , , , } %res +} + +define { , , , } @ld4.nxv8f16( %Pg, half *%addr) { +; NEON-LABEL: @ld4.nxv8f16( +; NEON-NEXT: [[RES:%.*]] = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv8f16(half* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , , } [[RES]] +; +; SVE-LABEL: @ld4.nxv8f16( +; SVE-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv8f16( [[PG:%.*]], half* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , , } [[LDN]] +; + %res = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv8f16(half *%addr, i32 0, %Pg, undef) + ret { , , , } %res +} + +define { , , , } @ld4.nxv8bf16( %Pg, bfloat *%addr) #0 { +; NEON-LABEL: @ld4.nxv8bf16( +; NEON-NEXT: [[RES:%.*]] = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv8bf16(bfloat* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , , } [[RES]] +; +; SVE-LABEL: @ld4.nxv8bf16( +; SVE-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv8bf16( [[PG:%.*]], bfloat* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , , } [[LDN]] +; + %res = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv8bf16(bfloat *%addr, i32 0, %Pg, undef) + ret { , , , } %res +} + +; ld4w +define { , , , } @ld4.nxv4i32( %Pg, i32 *%addr) { +; NEON-LABEL: @ld4.nxv4i32( +; NEON-NEXT: [[RES:%.*]] = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv4i32(i32* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , , } [[RES]] +; +; SVE-LABEL: @ld4.nxv4i32( +; SVE-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( [[PG:%.*]], i32* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , , } [[LDN]] +; + %res = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv4i32(i32 *%addr, i32 0, %Pg, undef) + ret { , , , } %res +} + +define { , , , } @ld4.nxv4f32( %Pg, float *%addr) { +; NEON-LABEL: @ld4.nxv4f32( +; NEON-NEXT: [[RES:%.*]] = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv4f32(float* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , , } [[RES]] +; +; SVE-LABEL: @ld4.nxv4f32( +; SVE-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4f32( [[PG:%.*]], float* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , , } [[LDN]] +; + %res = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv4f32(float *%addr, i32 0, %Pg, undef) + ret { , , , } %res +} + +; ld4d +define { , , , } @ld4.nxv2i64( %Pg, i64 *%addr) { +; NEON-LABEL: @ld4.nxv2i64( +; NEON-NEXT: [[RES:%.*]] = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv2i64(i64* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , , } [[RES]] +; +; SVE-LABEL: @ld4.nxv2i64( +; SVE-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv2i64( [[PG:%.*]], i64* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , , } [[LDN]] +; + %res = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv2i64(i64 *%addr, i32 0, %Pg, undef) + ret { , , , } %res +} + +define { , , , } @ld4.nxv2f64( %Pg, double *%addr) { +; NEON-LABEL: @ld4.nxv2f64( +; NEON-NEXT: [[RES:%.*]] = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv2f64(double* [[ADDR:%.*]], i32 0, [[PG:%.*]], undef) +; NEON-NEXT: ret { , , , } [[RES]] +; +; SVE-LABEL: @ld4.nxv2f64( +; SVE-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv2f64( [[PG:%.*]], double* [[ADDR:%.*]]) +; SVE-NEXT: ret { , , , } [[LDN]] +; + %res = call { , , , } 
@llvm.experimental.masked.interleaved4.load.nxv2f64(double *%addr, i32 0, %Pg, undef) + ret { , , , } %res +} + +declare { , } @llvm.experimental.masked.interleaved2.load.nxv16i8(i8*, i32, , ) +declare { , } @llvm.experimental.masked.interleaved2.load.nxv8i16(i16*, i32, , ) +declare { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32*, i32, , ) +declare { , } @llvm.experimental.masked.interleaved2.load.nxv2i64(i64*, i32, , ) +declare { , } @llvm.experimental.masked.interleaved2.load.nxv8f16(half*, i32, , ) +declare { , } @llvm.experimental.masked.interleaved2.load.nxv8bf16(bfloat*, i32, , ) +declare { , } @llvm.experimental.masked.interleaved2.load.nxv4f32(float*, i32, , ) +declare { , } @llvm.experimental.masked.interleaved2.load.nxv2f64(double*, i32, , ) + +declare { , , } @llvm.experimental.masked.interleaved3.load.nxv16i8(i8*, i32, , ) +declare { , , } @llvm.experimental.masked.interleaved3.load.nxv8i16(i16*, i32, , ) +declare { , , } @llvm.experimental.masked.interleaved3.load.nxv4i32(i32*, i32, , ) +declare { , , } @llvm.experimental.masked.interleaved3.load.nxv2i64(i64*, i32, , ) +declare { , , } @llvm.experimental.masked.interleaved3.load.nxv8f16(half*, i32, , ) +declare { , , } @llvm.experimental.masked.interleaved3.load.nxv8bf16(bfloat*, i32, , ) +declare { , , } @llvm.experimental.masked.interleaved3.load.nxv4f32(float*, i32, , ) +declare { , , } @llvm.experimental.masked.interleaved3.load.nxv2f64(double*, i32, , ) + +declare { , , , } @llvm.experimental.masked.interleaved4.load.nxv16i8(i8*, i32, , ) +declare { , , , } @llvm.experimental.masked.interleaved4.load.nxv8i16(i16*, i32, , ) +declare { , , , } @llvm.experimental.masked.interleaved4.load.nxv4i32(i32*, i32, , ) +declare { , , , } @llvm.experimental.masked.interleaved4.load.nxv2i64(i64*, i32, , ) +declare { , , , } @llvm.experimental.masked.interleaved4.load.nxv8f16(half*, i32, , ) +declare { , , , } @llvm.experimental.masked.interleaved4.load.nxv8bf16(bfloat*, i32, , ) +declare { , , , } @llvm.experimental.masked.interleaved4.load.nxv4f32(float*, i32, , ) +declare { , , , } @llvm.experimental.masked.interleaved4.load.nxv2f64(double*, i32, , ) + + +; +bf16 is required for the bfloat version. 
+attributes #0 = { "target-features"="+bf16" } Index: llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses-store.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses-store.ll @@ -0,0 +1,491 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefix=NEON +; RUN: opt < %s -mattr=+sve -interleaved-access -S | FileCheck %s -check-prefix=SVE + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; +; ST2B +; + +define void @st2.nxv16i8( %v0, %v1, %pred, i8* %addr) { +; NEON-LABEL: @st2.nxv16i8( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv16i8( [[V0:%.*]], [[V1:%.*]], i8* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st2.nxv16i8( +; SVE-NEXT: call void @llvm.aarch64.sve.st2.nxv16i8( [[V0:%.*]], [[V1:%.*]], [[PRED:%.*]], i8* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved2.store.nxv16i8( %v0, + %v1, + i8* %addr, i32 0, + %pred) + ret void +} + +; +; ST2H +; + +define void @st2.nxv8i16( %v0, %v1, %pred, i16* %addr) { +; NEON-LABEL: @st2.nxv8i16( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv8i16( [[V0:%.*]], [[V1:%.*]], i16* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st2.nxv8i16( +; SVE-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16( [[V0:%.*]], [[V1:%.*]], [[PRED:%.*]], i16* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved2.store.nxv8i16( %v0, + %v1, + i16* %addr, i32 0, + %pred) + ret void +} + +define void @st2.nxv8f16( %v0, %v1, %pred, half* %addr) { +; NEON-LABEL: @st2.nxv8f16( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv8f16( [[V0:%.*]], [[V1:%.*]], half* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st2.nxv8f16( +; SVE-NEXT: call void @llvm.aarch64.sve.st2.nxv8f16( [[V0:%.*]], [[V1:%.*]], [[PRED:%.*]], half* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved2.store.nxv8f16( %v0, + %v1, + half* %addr, i32 0, + %pred) + ret void +} + +define void @st2.nxv8bf16( %v0, %v1, %pred, bfloat* %addr) #0 { +; NEON-LABEL: @st2.nxv8bf16( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv8bf16( [[V0:%.*]], [[V1:%.*]], bfloat* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st2.nxv8bf16( +; SVE-NEXT: call void @llvm.aarch64.sve.st2.nxv8bf16( [[V0:%.*]], [[V1:%.*]], [[PRED:%.*]], bfloat* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved2.store.nxv8bf16( %v0, + %v1, + bfloat* %addr, i32 0, + %pred) + ret void +} + +; +; ST2W +; + +define void @st2_nxv4i32( %v0, %v1, %pred, i32* %addr) { +; NEON-LABEL: @st2_nxv4i32( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv4i32( [[V0:%.*]], [[V1:%.*]], i32* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st2_nxv4i32( +; SVE-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32( [[V0:%.*]], [[V1:%.*]], [[PRED:%.*]], i32* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved2.store.nxv4i32( %v0, + %v1, + i32* %addr, i32 0, + %pred) + ret void +} + +define void @st2_nx4f32( %v0, %v1, %pred, float* %addr) { +; NEON-LABEL: @st2_nx4f32( +; NEON-NEXT: call void 
@llvm.experimental.masked.interleaved2.store.nxv4f32( [[V0:%.*]], [[V1:%.*]], float* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st2_nx4f32( +; SVE-NEXT: call void @llvm.aarch64.sve.st2.nxv4f32( [[V0:%.*]], [[V1:%.*]], [[PRED:%.*]], float* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved2.store.nxv4f32( %v0, + %v1, + float* %addr, i32 0, + %pred) + ret void +} + +; +; ST2D +; + +define void @st2_nx2i64( %v0, %v1, %pred, i64* %addr) { +; NEON-LABEL: @st2_nx2i64( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv2i64( [[V0:%.*]], [[V1:%.*]], i64* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st2_nx2i64( +; SVE-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64( [[V0:%.*]], [[V1:%.*]], [[PRED:%.*]], i64* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved2.store.nxv2i64( %v0, + %v1, + i64* %addr, i32 0, + %pred) + ret void +} + +define void @st2_nx2f64( %v0, %v1, %pred, double* %addr) { +; NEON-LABEL: @st2_nx2f64( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv2f64( [[V0:%.*]], [[V1:%.*]], double* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st2_nx2f64( +; SVE-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64( [[V0:%.*]], [[V1:%.*]], [[PRED:%.*]], double* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved2.store.nxv2f64( %v0, + %v1, + double* %addr, i32 0, + %pred) + ret void +} + +; +; ST3B +; + +define void @st3_nx16i8( %v0, %v1, %v2, %pred, i8* %addr) { +; NEON-LABEL: @st3_nx16i8( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved3.store.nxv16i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], i8* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st3_nx16i8( +; SVE-NEXT: call void @llvm.aarch64.sve.st3.nxv16i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[PRED:%.*]], i8* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved3.store.nxv16i8( %v0, + %v1, + %v2, + i8* %addr, i32 0, + %pred) + ret void +} + +; +; ST3H +; + +define void @st3_nx8i16( %v0, %v1, %v2, %pred, i16* %addr) { +; NEON-LABEL: @st3_nx8i16( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved3.store.nxv8i16( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], i16* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st3_nx8i16( +; SVE-NEXT: call void @llvm.aarch64.sve.st3.nxv8i16( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[PRED:%.*]], i16* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved3.store.nxv8i16( %v0, + %v1, + %v2, + i16* %addr, i32 0, + %pred) + ret void +} + +define void @st3_nx8f16( %v0, %v1, %v2, %pred, half* %addr) { +; NEON-LABEL: @st3_nx8f16( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved3.store.nxv8f16( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], half* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st3_nx8f16( +; SVE-NEXT: call void @llvm.aarch64.sve.st3.nxv8f16( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[PRED:%.*]], half* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved3.store.nxv8f16( %v0, + %v1, + %v2, + half* %addr, i32 0, + %pred) + ret void +} + +define void @st3_nx8bf16( %v0, %v1, %v2, %pred, bfloat* %addr) #0 { +; NEON-LABEL: @st3_nx8bf16( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved3.store.nxv8bf16( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], bfloat* [[ADDR:%.*]], i32 0, 
[[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st3_nx8bf16( +; SVE-NEXT: call void @llvm.aarch64.sve.st3.nxv8bf16( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[PRED:%.*]], bfloat* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved3.store.nxv8bf16( %v0, + %v1, + %v2, + bfloat* %addr, i32 0, + %pred) + ret void +} + +; +; ST3W +; + +define void @st3_nx4i32( %v0, %v1, %v2, %pred, i32* %addr) { +; NEON-LABEL: @st3_nx4i32( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved3.store.nxv4i32( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], i32* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st3_nx4i32( +; SVE-NEXT: call void @llvm.aarch64.sve.st3.nxv4i32( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[PRED:%.*]], i32* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved3.store.nxv4i32( %v0, + %v1, + %v2, + i32* %addr, i32 0, + %pred) + ret void +} + +define void @st3_nx4f32( %v0, %v1, %v2, %pred, float* %addr) { +; NEON-LABEL: @st3_nx4f32( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved3.store.nxv4f32( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], float* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st3_nx4f32( +; SVE-NEXT: call void @llvm.aarch64.sve.st3.nxv4f32( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[PRED:%.*]], float* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved3.store.nxv4f32( %v0, + %v1, + %v2, + float* %addr, i32 0, + %pred) + ret void +} + +; +; ST3D +; + +define void @st3_nx2i64( %v0, %v1, %v2, %pred, i64* %addr) { +; NEON-LABEL: @st3_nx2i64( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved3.store.nxv2i64( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], i64* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st3_nx2i64( +; SVE-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[PRED:%.*]], i64* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved3.store.nxv2i64( %v0, + %v1, + %v2, + i64* %addr, i32 0, + %pred) + ret void +} + +define void @st3_nx2f64( %v0, %v1, %v2, %pred, double* %addr) { +; NEON-LABEL: @st3_nx2f64( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved3.store.nxv2f64( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], double* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st3_nx2f64( +; SVE-NEXT: call void @llvm.aarch64.sve.st3.nxv2f64( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[PRED:%.*]], double* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved3.store.nxv2f64( %v0, + %v1, + %v2, + double* %addr, i32 0, + %pred) + ret void +} + +; +; ST4B +; + +define void @st4_nx16i8( %v0, %v1, %v2, %v3, %pred, i8* %addr) { +; NEON-LABEL: @st4_nx16i8( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved4.store.nxv16i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], i8* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st4_nx16i8( +; SVE-NEXT: call void @llvm.aarch64.sve.st4.nxv16i8( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[PRED:%.*]], i8* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved4.store.nxv16i8( %v0, + %v1, + %v2, + %v3, + i8* %addr, i32 0, + %pred) + ret void +} + +; +; ST4H +; + +define void @st4_nx8i16( %v0, %v1, %v2, %v3, %pred, i16* %addr) { +; NEON-LABEL: @st4_nx8i16( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved4.store.nxv8i16( [[V0:%.*]], [[V1:%.*]], 
[[V2:%.*]], [[V3:%.*]], i16* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st4_nx8i16( +; SVE-NEXT: call void @llvm.aarch64.sve.st4.nxv8i16( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[PRED:%.*]], i16* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved4.store.nxv8i16( %v0, + %v1, + %v2, + %v3, + i16* %addr, i32 0, + %pred) + ret void +} + +define void @st4_nx8f16( %v0, %v1, %v2, %v3, %pred, half* %addr) { +; NEON-LABEL: @st4_nx8f16( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved4.store.nxv8f16( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], half* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st4_nx8f16( +; SVE-NEXT: call void @llvm.aarch64.sve.st4.nxv8f16( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[PRED:%.*]], half* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved4.store.nxv8f16( %v0, + %v1, + %v2, + %v3, + half* %addr, i32 0, + %pred) + ret void +} + +define void @st4_nx8bf16( %v0, %v1, %v2, %v3, %pred, bfloat* %addr) #0 { +; NEON-LABEL: @st4_nx8bf16( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved4.store.nxv8bf16( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], bfloat* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st4_nx8bf16( +; SVE-NEXT: call void @llvm.aarch64.sve.st4.nxv8bf16( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[PRED:%.*]], bfloat* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved4.store.nxv8bf16( %v0, + %v1, + %v2, + %v3, + bfloat* %addr, i32 0, + %pred) + ret void +} + +; +; ST4W +; + +define void @st4_nx4i32( %v0, %v1, %v2, %v3, %pred, i32* %addr) { +; NEON-LABEL: @st4_nx4i32( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved4.store.nxv4i32( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], i32* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st4_nx4i32( +; SVE-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[PRED:%.*]], i32* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved4.store.nxv4i32( %v0, + %v1, + %v2, + %v3, + i32* %addr, i32 0, + %pred) + ret void +} + +define void @st4_nx4f32( %v0, %v1, %v2, %v3, %pred, float* %addr) { +; NEON-LABEL: @st4_nx4f32( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved4.store.nxv4f32( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], float* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st4_nx4f32( +; SVE-NEXT: call void @llvm.aarch64.sve.st4.nxv4f32( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[PRED:%.*]], float* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved4.store.nxv4f32( %v0, + %v1, + %v2, + %v3, + float* %addr, i32 0, + %pred) + ret void +} + +; +; ST4D +; + +define void @st4_nx2i64( %v0, %v1, %v2, %v3, %pred, i64* %addr) { +; NEON-LABEL: @st4_nx2i64( +; NEON-NEXT: call void @llvm.experimental.masked.interleaved4.store.nxv2i64( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], i64* [[ADDR:%.*]], i32 0, [[PRED:%.*]]) +; NEON-NEXT: ret void +; +; SVE-LABEL: @st4_nx2i64( +; SVE-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64( [[V0:%.*]], [[V1:%.*]], [[V2:%.*]], [[V3:%.*]], [[PRED:%.*]], i64* [[ADDR:%.*]]) +; SVE-NEXT: ret void +; + call void @llvm.experimental.masked.interleaved4.store.nxv2i64( %v0, + %v1, + %v2, + %v3, + i64* %addr, i32 0, + %pred) + ret void +} + +define void 
@st4_nx2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x i1> %pred, double* %addr) {
+; NEON-LABEL: @st4_nx2f64(
+; NEON-NEXT:    call void @llvm.experimental.masked.interleaved4.store.nxv2f64(<vscale x 2 x double> [[V0:%.*]], <vscale x 2 x double> [[V1:%.*]], <vscale x 2 x double> [[V2:%.*]], <vscale x 2 x double> [[V3:%.*]], double* [[ADDR:%.*]], i32 0, <vscale x 2 x i1> [[PRED:%.*]])
+; NEON-NEXT:    ret void
+;
+; SVE-LABEL: @st4_nx2f64(
+; SVE-NEXT:    call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> [[V0:%.*]], <vscale x 2 x double> [[V1:%.*]], <vscale x 2 x double> [[V2:%.*]], <vscale x 2 x double> [[V3:%.*]], <vscale x 2 x i1> [[PRED:%.*]], double* [[ADDR:%.*]])
+; SVE-NEXT:    ret void
+;
+  call void @llvm.experimental.masked.interleaved4.store.nxv2f64(<vscale x 2 x double> %v0,
+                                                                 <vscale x 2 x double> %v1,
+                                                                 <vscale x 2 x double> %v2,
+                                                                 <vscale x 2 x double> %v3,
+                                                                 double* %addr, i32 0,
+                                                                 <vscale x 2 x i1> %pred)
+  ret void
+}
+
+attributes #0 = { "target-features"="+bf16" }
+
+declare void @llvm.experimental.masked.interleaved2.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i8*, i32, <vscale x 16 x i1>)
+declare void @llvm.experimental.masked.interleaved2.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i16*, i32, <vscale x 8 x i1>)
+declare void @llvm.experimental.masked.interleaved2.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32*, i32, <vscale x 4 x i1>)
+declare void @llvm.experimental.masked.interleaved2.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i64*, i32, <vscale x 2 x i1>)
+declare void @llvm.experimental.masked.interleaved2.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, half*, i32, <vscale x 8 x i1>)
+declare void @llvm.experimental.masked.interleaved2.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, bfloat*, i32, <vscale x 8 x i1>)
+declare void @llvm.experimental.masked.interleaved2.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, float*, i32, <vscale x 4 x i1>)
+declare void @llvm.experimental.masked.interleaved2.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, double*, i32, <vscale x 2 x i1>)
+
+declare void @llvm.experimental.masked.interleaved3.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i8*, i32, <vscale x 16 x i1>)
+declare void @llvm.experimental.masked.interleaved3.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i16*, i32, <vscale x 8 x i1>)
+declare void @llvm.experimental.masked.interleaved3.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, i32*, i32, <vscale x 4 x i1>)
+declare void @llvm.experimental.masked.interleaved3.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, i64*, i32, <vscale x 2 x i1>)
+declare void @llvm.experimental.masked.interleaved3.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, half*, i32, <vscale x 8 x i1>)
+declare void @llvm.experimental.masked.interleaved3.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, bfloat*, i32, <vscale x 8 x i1>)
+declare void @llvm.experimental.masked.interleaved3.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, float*, i32, <vscale x 4 x i1>)
+declare void @llvm.experimental.masked.interleaved3.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, double*, i32, <vscale x 2 x i1>)
+
+declare void @llvm.experimental.masked.interleaved4.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i8*, i32, <vscale x 16 x i1>)
+declare void @llvm.experimental.masked.interleaved4.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i16*, i32, <vscale x 8 x i1>)
+declare void @llvm.experimental.masked.interleaved4.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, i32*, i32, <vscale x 4 x i1>)
+declare void @llvm.experimental.masked.interleaved4.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, i64*, i32, <vscale x 2 x i1>)
+declare void @llvm.experimental.masked.interleaved4.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, half*, i32, <vscale x 8 x i1>)
+declare void @llvm.experimental.masked.interleaved4.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, bfloat*, i32, <vscale x 8 x i1>)
+declare void @llvm.experimental.masked.interleaved4.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, float*, i32, <vscale x 4 x i1>)
+declare void @llvm.experimental.masked.interleaved4.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, double*, i32, <vscale x 2 x i1>)
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -1,15 +1,15 @@
 ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -prefer-predicate-over-epilogue=scalar-epilogue \
-; RUN:   -force-ordered-reductions=false -hints-allow-reordering=false -S | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
+; RUN:   -force-ordered-reductions=false -hints-allow-reordering=false -enable-sve-interleaved-mem-accesses=false -S | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
 ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve
-prefer-predicate-over-epilogue=scalar-epilogue \ -; RUN: -force-ordered-reductions=false -hints-allow-reordering=true -S | FileCheck %s --check-prefix=CHECK-UNORDERED +; RUN: -force-ordered-reductions=false -hints-allow-reordering=true -enable-sve-interleaved-mem-accesses=false -S | FileCheck %s --check-prefix=CHECK-UNORDERED ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -prefer-predicate-over-epilogue=scalar-epilogue \ -; RUN: -force-ordered-reductions=true -hints-allow-reordering=false -S | FileCheck %s --check-prefix=CHECK-ORDERED +; RUN: -force-ordered-reductions=true -hints-allow-reordering=false -enable-sve-interleaved-mem-accesses=false -S | FileCheck %s --check-prefix=CHECK-ORDERED ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -prefer-predicate-over-epilogue=scalar-epilogue \ -; RUN: -force-ordered-reductions=true -hints-allow-reordering=true -S | FileCheck %s --check-prefix=CHECK-UNORDERED +; RUN: -force-ordered-reductions=true -hints-allow-reordering=true -enable-sve-interleaved-mem-accesses=false -S | FileCheck %s --check-prefix=CHECK-UNORDERED ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -prefer-predicate-over-epilogue=scalar-epilogue \ -; RUN: -hints-allow-reordering=false -S | FileCheck %s --check-prefix=CHECK-ORDERED +; RUN: -hints-allow-reordering=false -enable-sve-interleaved-mem-accesses=false -S | FileCheck %s --check-prefix=CHECK-ORDERED ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ -; RUN: -hints-allow-reordering=false -S | FileCheck %s --check-prefix=CHECK-ORDERED-TF +; RUN: -hints-allow-reordering=false -enable-sve-interleaved-mem-accesses=false -S | FileCheck %s --check-prefix=CHECK-ORDERED-TF define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-LABEL: @fadd_strict Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll @@ -216,27 +216,19 @@ define void @interleave(float* noalias %dst, float* noalias %src, i64 %n) #0 { ; CHECK-NOTF-LABEL: @interleave( ; CHECK-NOTF: vector.body: -; CHECK-NOTF: %[[LOAD:.*]] = load <8 x float>, <8 x float> -; CHECK-NOTF: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> -; CHECK-NOTF: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> +; CHECK-NOTF: %[[LOAD:.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4f32( ; CHECK-TF-LABEL: @interleave( ; CHECK-TF: vector.body: -; CHECK-TF: %[[LOAD:.*]] = load <8 x float>, <8 x float> -; CHECK-TF: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> -; CHECK-TF: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> +; CHECK-TF: %[[LOAD:.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4f32( ; CHECK-TF-NORED-LABEL: @interleave( ; CHECK-TF-NORED: vector.body: -; CHECK-TF-NORED: %[[LOAD:.*]] = load <8 x float>, <8 x float> -; CHECK-TF-NORED: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> -; CHECK-TF-NORED: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> +; CHECK-TF-NORED: %[[LOAD:.*]] = call { , } 
@llvm.experimental.masked.interleaved2.load.nxv4f32( ; CHECK-TF-NOREC-LABEL: @interleave( ; CHECK-TF-NOREC: vector.body: -; CHECK-TF-NOREC: %[[LOAD:.*]] = load <8 x float>, <8 x float> -; CHECK-TF-NOREC: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> -; CHECK-TF-NOREC: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> +; CHECK-TF-NOREC: %[[LOAD:.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4f32( entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine -S \ -; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s +; RUN: -prefer-predicate-over-epilogue=scalar-epilogue -enable-sve-interleaved-mem-accesses=false < %s | FileCheck %s ; Ensure that we can vectorize loops such as: ; int *ptr = c; Index: llvm/test/Transforms/LoopVectorize/sve-interleaved-accesses-ic-2.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/sve-interleaved-accesses-ic-2.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=aarch64-none-linux-gnu -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=2 -enable-interleaved-mem-accesses=true -enable-sve-interleaved-mem-accesses=true -mattr=+sve -scalable-vectorization=on -runtime-memory-check-threshold=24 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Check vectorization on an interleaved load group of factor 2 and an interleaved +; store group of factor 2. 
+ +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) #1 { +; CHECK-LABEL: @test_array_load2_store2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i32 [[D]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* nonnull [[TMP5]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP8:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* nonnull [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[TMP7]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[TMP8]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[TMP7]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[TMP8]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = or i64 [[TMP4]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = add nsw [[TMP9]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = add nsw [[TMP10]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP17:%.*]] = mul nsw [[TMP11]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP18:%.*]] = mul nsw [[TMP12]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP13]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP14]] +; CHECK-NEXT: 
[[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i64 -1 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i64 -1 +; CHECK-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv4i32( [[TMP15]], [[TMP17]], i32* nonnull [[TMP21]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv4i32( [[TMP16]], [[TMP18]], i32* nonnull [[TMP22]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP24:%.*]] = shl nuw nsw i64 [[TMP23]], 3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], [[C]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[D]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP1]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx0, align 4 + %tmp1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1 + %tmp2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %tmp, %C + %mul = mul nsw i32 %tmp2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +attributes #1 = 
{ "target-features"="+sve" vscale_range(1, 16) } Index: llvm/test/Transforms/LoopVectorize/sve-interleaved-accesses.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/sve-interleaved-accesses.ll @@ -0,0 +1,2180 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=aarch64-none-linux-gnu -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -enable-sve-interleaved-mem-accesses=true -mattr=+sve -scalable-vectorization=on -runtime-memory-check-threshold=24 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Check vectorization on an interleaved load group of factor 2 and an interleaved +; store group of factor 2. + +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) #1 { +; CHECK-LABEL: @test_array_load2_store2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP3:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* nonnull [[TMP2]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP3]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw [[TMP5]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i64 -1 +; CHECK-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv4i32( [[TMP7]], [[TMP8]], i32* nonnull [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: 
[[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], [[C]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[D]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP1]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx0, align 4 + %tmp1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1 + %tmp2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %tmp, %C + %mul = mul nsw i32 %tmp2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on an interleaved load group of factor 2 with narrower types and an interleaved +; store group of factor 2. 
+ +; short AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; short A = AB[i]; +; short B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + + +@AB_i16 = common global [1024 x i16] zeroinitializer, align 4 + +define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { +; CHECK-LABEL: @test_array_load2_i16_store2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( [[TMP6]], i32 2, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, [[TMP7]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( [[TMP8]], i32 2, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = sext [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[BROADCAST_SPLAT]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = sext [[WIDE_MASKED_GATHER1]] to +; CHECK-NEXT: [[TMP12:%.*]] = mul nsw [[BROADCAST_SPLAT3]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 -1 +; CHECK-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv4i32( [[TMP10]], [[TMP12]], i32* nonnull [[TMP15]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP19:%.*]] = load i16, i16* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, i64 [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV]], [[C]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV6]], [[D]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP20]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 2 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, i64 %1 + %2 = load i16, i16* %arrayidx2, align 2 + %conv = sext i16 %0 to i32 + %add3 = add nsw i32 %conv, %C + %arrayidx5 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add3, i32* %arrayidx5, align 4 + %conv6 = sext i16 %2 to i32 + %mul = mul nsw i32 %conv6, %D + %arrayidx9 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv, 1022 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on an interleaved load group of factor 2 and an interleaved +; store group of factor 2 with narrower types. 
+ +; int AB[1024]; +; short CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; short A = AB[i]; +; short B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + + +@CD_i16 = dso_local local_unnamed_addr global [1024 x i16] zeroinitializer, align 2 + +define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { +; CHECK-LABEL: @test_array_load2_store2_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP7:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* nonnull [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[TMP7]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[TMP7]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = add nsw [[TMP8]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP12:%.*]] = trunc [[TMP11]] to +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, i64 0, [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( [[TMP12]], [[TMP13]], i32 2, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = mul nsw [[TMP9]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP15:%.*]] = trunc [[TMP14]] to +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, i64 0, [[TMP10]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( [[TMP15]], [[TMP16]], i32 2, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw 
i64 [[TMP17]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[C]] +; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[ADD3]] to i16 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store i16 [[CONV]], i16* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP22]], [[D]] +; CHECK-NEXT: [[CONV6:%.*]] = trunc i32 [[MUL]] to i16 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, i64 0, i64 [[TMP21]] +; CHECK-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX9]], align 2 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %0, %C + %conv = trunc i32 %add3 to i16 + %arrayidx5 = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, i64 0, i64 %indvars.iv + store i16 %conv, i16* %arrayidx5, align 2 + %mul = mul nsw i32 %2, %D + %conv6 = trunc i32 %mul to i16 + %arrayidx9 = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, i64 0, i64 %1 + store i16 %conv6, i16* %arrayidx9, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv, 1022 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} +; int A[3072]; +; struct ST S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; T[i].x = X1 + 1; +; T[i].y = X2 + 2; +; T[i].z = X3 + 3; +; } +; } + +%struct.ST3 = type { i32, i32, i32 } +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void 
@test_struct_array_load3_store3() #1 { +; CHECK-LABEL: @test_struct_array_load3_store3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[N_VEC]], 3 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr [3072 x i32], [3072 x i32]* @A, i64 0, i64 [[TMP2]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw i64 [[TMP3]], 12 +; CHECK-NEXT: [[TMP5:%.*]] = call { , , } @llvm.experimental.masked.interleaved3.load.nxv4i32(i32* [[POINTER_PHI]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP5]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , } [[TMP5]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[TMP7]], shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = add nsw [[TMP8]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 -2 +; CHECK-NEXT: call void @llvm.experimental.masked.interleaved3.store.nxv4i32( [[TMP9]], [[TMP10]], [[TMP11]], i32* nonnull [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[PTR_016:%.*]] = phi i32* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR2:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, 
i32* [[PTR_016]], i64 1 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR_016]], align 4 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[PTR_016]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[INCDEC_PTR2]] = getelementptr inbounds i32, i32* [[PTR_016]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], 1 +; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: store i32 [[ADD]], i32* [[X]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 2 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: store i32 [[ADD3]], i32* [[Y]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 3 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDVARS_IV]], i32 2 +; CHECK-NEXT: store i32 [[ADD6]], i32* [[Z]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %tmp = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %tmp1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %tmp2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %tmp, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add nsw i32 %tmp1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %tmp2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on an interleaved load group of factor 4. 
+ +; struct ST4{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST4 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) #1 { +; CHECK-LABEL: @test_struct_load4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = call { , , , } @llvm.experimental.masked.interleaved4.load.nxv4i32(i32* [[TMP2]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP3]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP4]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP9:%.*]] = add [[TMP8]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP11]] = sub [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP11]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_022:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB8:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[S]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[X]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], [[R_022]] +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[S]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[Y]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[S]], i64 [[INDVARS_IV]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[Z]], align 4 +; CHECK-NEXT: [[W:%.*]] = 
getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[S]], i64 [[INDVARS_IV]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[W]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[ADD]], [[TMP2]] +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[SUB8]] = sub i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[SUB8_LCSSA:%.*]] = phi i32 [ [[SUB8]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[SUB8_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %tmp = load i32, i32* %x, align 4 + %add = add nsw i32 %tmp, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %tmp1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %tmp1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %tmp2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %tmp2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %tmp3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %tmp3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + + +; Check vectorization on an interleaved load group of factor 6. 
+; There is no dedicated ldN/stN so use gather instead + +%struct.ST6 = type { i32, i32, i32, i32, i32, i32 } + +define i32 @test_struct_load6(%struct.ST6* %S) #1 { +; CHECK-LABEL: @test_struct_load6( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], %struct.ST6* [[S:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP5]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], [[VEC_IND]], i32 2 +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP7]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], [[VEC_IND]], i32 3 +; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], [[VEC_IND]], i32 4 +; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], [[VEC_IND]], i32 5 +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP11:%.*]] = add [[WIDE_MASKED_GATHER]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP11]], [[WIDE_MASKED_GATHER2]] +; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER3]] +; CHECK-NEXT: [[TMP14:%.*]] = add [[TMP13]], [[WIDE_MASKED_GATHER4]] +; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP14]], [[WIDE_MASKED_GATHER5]] +; CHECK-NEXT: [[TMP16]] = sub 
[[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_041:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB14:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[X]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[Y]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 [[INDVARS_IV]], i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[Z]], align 4 +; CHECK-NEXT: [[W:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 [[INDVARS_IV]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[W]], align 4 +; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 [[INDVARS_IV]], i32 4 +; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 [[INDVARS_IV]], i32 5 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[B]], align 4 +; CHECK-NEXT: [[DOTNEG36:%.*]] = add i32 [[TMP21]], [[R_041]] +; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[DOTNEG36]], [[TMP23]] +; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP28]], [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP29]], [[TMP26]] +; CHECK-NEXT: [[SUB14]] = sub i32 [[TMP27]], [[TMP30]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[SUB14_LCSSA:%.*]] = phi i32 [ [[SUB14]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[SUB14_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.041 = phi i32 [ 0, %entry ], [ %sub14, %for.body ] + %x = getelementptr inbounds %struct.ST6, %struct.ST6* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST6, %struct.ST6* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %z = getelementptr inbounds %struct.ST6, %struct.ST6* %S, i64 
%indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %w = getelementptr inbounds %struct.ST6, %struct.ST6* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %a = getelementptr inbounds %struct.ST6, %struct.ST6* %S, i64 %indvars.iv, i32 4 + %4 = load i32, i32* %a, align 4 + %b = getelementptr inbounds %struct.ST6, %struct.ST6* %S, i64 %indvars.iv, i32 5 + %5 = load i32, i32* %b, align 4 + %.neg36 = add i32 %0, %r.041 + %6 = add i32 %.neg36, %2 + %7 = add i32 %1, %3 + %8 = add i32 %7, %4 + %9 = add i32 %8, %5 + %sub14 = sub i32 %6, %9 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body + %sub14.lcssa = phi i32 [ %sub14, %for.body ] + ret i32 %sub14.lcssa +} + +; Check vectorization on an interleaved store group of factor 4. + +; void test_struct_store4(int *A, struct ST4 *B) { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; B[i].x = X + 1; +; B[i].y = X * 2; +; B[i].z = X + 3; +; B[i].w = X + 4; +; } +; } + + +define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) #1 { +; CHECK-LABEL: @test_struct_store4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[POINTER_PHI]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP6:%.*]] = shl nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 4, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[B:%.*]], i64 [[INDEX]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i64 -3 +; CHECK-NEXT: call void @llvm.experimental.masked.interleaved4.store.nxv4i32( [[TMP5]], [[TMP6]], [[TMP7]], [[TMP8]], i32* nonnull [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 
[[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[PTR_024:%.*]] = phi i32* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[PTR_024]], i64 1 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR_024]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], 1 +; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: store i32 [[ADD]], i32* [[X]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: store i32 [[MUL]], i32* [[Y]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP]], 3 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 2 +; CHECK-NEXT: store i32 [[ADD3]], i32* [[Z]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP]], 4 +; CHECK-NEXT: [[W:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 3 +; CHECK-NEXT: store i32 [[ADD6]], i32* [[W]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1 + %tmp = load i32, i32* %ptr.024, align 4 + %add = add nsw i32 %tmp, 1 + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %tmp, 1 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %add3 = add nsw i32 %tmp, 3 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2 + store i32 %add3, i32* %z, align 4 + %add6 = add nsw i32 %tmp, 4 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3 + store i32 %add6, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Check vectorization on a reverse interleaved load group of factor 2 and +; a reverse interleaved store group of factor 2. 
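+
+; The CHECK lines below expect roughly the following shape (a sketch only;
+; %base, %mask and the other names are illustrative, and the element types are
+; the <vscale x 4 x i32> / <vscale x 4 x i1> ones implied by the .nxv4i32
+; suffix): load both fields with the factor-2 intrinsic, reverse each field,
+; compute, then reverse the results back before the factor-2 store.
+;
+;   %ld    = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
+;              @llvm.experimental.masked.interleaved2.load.nxv4i32(
+;                i32* %base, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+;   %x     = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld, 0
+;   %rev.x = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %x)
+;   ; ... likewise for field 1, do the add/sub, then reverse the two results ...
+;   call void @llvm.experimental.masked.interleaved2.store.nxv4i32(
+;          <vscale x 4 x i32> %rev.a, <vscale x 4 x i32> %rev.b,
+;          i32* %store.base, i32 4, <vscale x 4 x i1> %mask)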
+ +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + + +%struct.ST2 = type { i32, i32 } + +define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) #1 { +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add nsw i64 [[N_MOD_VF]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i32 1023, i32 0), poison, zeroinitializer), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i32 [[TMP3]], -4 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[DOTNEG]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], %struct.ST2* [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3 +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* nonnull [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[TMP10]], 0 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[TMP10]], 1 +; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP14:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B:%.*]], i64 [[OFFSET_IDX]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw i32 1, [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i64 [[TMP19]] +; CHECK-NEXT: [[REVERSE2:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP14]]) +; 
CHECK-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv4i32( [[REVERSE2]], [[REVERSE3]], i32* nonnull [[TMP20]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP21]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[A]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[X]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], [[TMP1]] +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[A]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[Y]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: store i32 [[ADD]], i32* [[X5]], align 4 +; CHECK-NEXT: [[Y8:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: store i32 [[SUB]], i32* [[Y8]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[INDVARS_IV]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP17:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %tmp = load i32, i32* %x, align 4 + %tmp1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %tmp, %tmp1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %tmp2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %tmp2, %tmp1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on an interleaved load group of factor 2 with 1 gap +; (missing the load of odd elements). Because the vectorized loop would +; speculatively access memory out-of-bounds, we must execute at least one +; iteration of the scalar loop. 
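+
+; A sketch of what the CHECK lines below are really pinning down (illustrative
+; names; types assumed to be the <vscale x 4 x i32>/<vscale x 4 x i1> forms of
+; the .nxv4i32 intrinsics): the remainder is forced to a whole VF when it would
+; otherwise be zero, so at least one scalar iteration always runs, and only
+; field 0 of the factor-2 load is consumed because the group has a gap.
+;
+;   %rem      = urem i64 512, %vf
+;   %rem.zero = icmp eq i64 %rem, 0
+;   %tail     = select i1 %rem.zero, i64 %vf, i64 %rem
+;   %n.vec    = sub i64 512, %tail
+;   ...
+;   %ld   = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
+;             @llvm.experimental.masked.interleaved2.load.nxv4i32(
+;               i32* %base, i32 4, <vscale x 4 x i1> %alltrue, <vscale x 4 x i32> undef)
+;   %even = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld, 0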
+ +; void even_load_static_tc(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; B[i/2] = A[i] * 2; +; } + + +define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) #1 { +; CHECK-LABEL: @even_load_static_tc( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[TMP3]] +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP5:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* [[TMP4]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = shl nsw [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[INDEX]], 9223372036854775804 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to * +; CHECK-NEXT: store [[TMP7]], * [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP19:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 
4 + %mul = shl nsw i32 %tmp, 1 + %tmp1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on an interleaved load group of factor 2 with 1 gap +; (missing the load of odd elements). Because the vectorized loop would +; speculatively access memory out-of-bounds, we must execute at least one +; iteration of the scalar loop. + +; void even_load_dynamic_tc(int *A, int *B, unsigned N) { +; for (unsigned i = 0; i < N; i+=2) +; B[i/2] = A[i] * 2; +; } + + +define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) #1 { +; CHECK-LABEL: @even_load_dynamic_tc( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2) +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT_NOT:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT_NOT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP4:%.*]] = add nuw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP8]] +; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP10:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[TMP10]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = shl nsw [[TMP11]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[INDEX]], 9223372036854775804 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to * +; CHECK-NEXT: store [[TMP12]], * [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] 
= phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP21:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %tmp, 1 + %tmp1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, %N + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on a reverse interleaved load group of factor 2 with 1 +; gap and a reverse interleaved store group of factor 2. The interleaved load +; group should be removed since it has a gap and is reverse. + +; struct pair { +; int x; +; int y; +; }; +; +; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) { +; for (int i = 1023; i >= 0; i--) { +; int a = X + i; +; int b = P2[i].y - i; +; P1[i].x = a; +; P2[i].y = b; +; } +; } + +;TODO: this still generates gathers/scatters; it looks like we could use an st2 instead of the scatters. +%pair = type { i64, i64 } +define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) #1 { +; CHECK-LABEL: @load_gap_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add nsw i64 [[N_MOD_VF]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i64 1023, i32 0), poison, zeroinitializer), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP3]], -4 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[DOTNEG]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[X:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]]
= add nsw [[BROADCAST_SPLAT]], [[VEC_IND]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P1:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2:%.*]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i64.nxv4p0i64( [[TMP6]], i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw [[WIDE_MASKED_GATHER]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0i64( [[TMP4]], [[TMP5]], i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0i64( [[TMP7]], [[TMP6]], i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = add nsw i64 [[I]], [[X]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[I]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[I]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP13]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = sub nsw i64 [[TMP14]], [[I]] +; CHECK-NEXT: store i64 [[TMP11]], i64* [[TMP12]], align 8 +; CHECK-NEXT: store i64 [[TMP15]], i64* [[TMP13]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nsw i64 [[I]], -1 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[I]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK: for.exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ] + %0 = add nsw i64 %X, %i + %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0 + %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1 + %3 = load i64, i64* %2, align 8 + %4 = sub nsw i64 %3, %i + store i64 %0, i64* %1, align 8 + store i64 %4, i64* %2, align 8 + %i.next = add nsw i64 %i, -1 + %cond = icmp sgt i64 %i, 0 + br i1 %cond, label %for.body, label %for.exit + +for.exit: + ret void +} + +; Check vectorization on interleaved access groups identified from mixed +; loads/stores. 
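+
+; Roughly, the mixed group below should turn into a factor-2 load whose two
+; fields feed both the multiply and the add, and a single factor-2 store of the
+; two results (a sketch with illustrative names; note the CHECK lines currently
+; show the load being emitted twice rather than reused):
+;
+;   %ld   = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
+;             @llvm.experimental.masked.interleaved2.load.nxv4i32(
+;               i32* %a.base, i32 4, <vscale x 4 x i1> %alltrue, <vscale x 4 x i32> undef)
+;   %even = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld, 0
+;   %odd  = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld, 1
+;   %mul  = mul nsw <vscale x 4 x i32> %odd, %even
+;   %add  = add nsw <vscale x 4 x i32> %odd, %even
+;   call void @llvm.experimental.masked.interleaved2.store.nxv4i32(
+;          <vscale x 4 x i32> %mul, <vscale x 4 x i32> %add,
+;          i32* %b.base, i32 4, <vscale x 4 x i1> %alltrue)
+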
+; void mixed_load2_store2(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) { +; B[i] = A[i] * A[i+1]; +; B[i+1] = A[i] + A[i+1]; +; } +; } + + +define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) #1 { +; CHECK-LABEL: @mixed_load2_store2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP3:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* [[TMP2]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP3]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* [[TMP2]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[TMP8]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add nsw [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 -1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 [[TMP6]] +; CHECK-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv4i32( [[TMP7]], [[TMP11]], i32* [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP]] +; CHECK-NEXT: 
[[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP25:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 4 + %tmp1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1 + %tmp2 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %tmp2, %tmp + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %tmp3 = load i32, i32* %arrayidx, align 4 + %tmp4 = load i32, i32* %arrayidx2, align 4 + %add10 = add nsw i32 %tmp4, %tmp3 + %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1 + store i32 %add10, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on interleaved access groups identified from mixed +; loads/stores. +; void mixed_load3_store3(int *A) { +; for (unsigned i = 0; i < 1024; i++) { +; *A++ += i; +; *A++ += i; +; *A++ += i; +; } +; } + +define void @mixed_load3_store3(i32* nocapture %A) #1 { +; CHECK-LABEL: @mixed_load3_store3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[N_VEC]], 3 +; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul nuw nsw i64 [[TMP6]], 12 +; CHECK-NEXT: [[TMP8:%.*]] = call { , , } @llvm.experimental.masked.interleaved3.load.nxv4i32(i32* [[POINTER_PHI]], i32 4, shufflevector ( insertelement ( poison, 
i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , } [[TMP8]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , } [[TMP8]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP9]], [[VEC_IND]] +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[VEC_IND]] +; CHECK-NEXT: [[TMP14:%.*]] = add [[TMP11]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.experimental.masked.interleaved3.store.nxv4i32( [[TMP12]], [[TMP13]], [[TMP14]], i32* [[POINTER_PHI]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32* [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A_ADDR_012:%.*]] = phi i32* [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INCDEC_PTR3:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[A_ADDR_012]], i64 1 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[A_ADDR_012]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[I_013]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[A_ADDR_012]], align 4 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[A_ADDR_012]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP1]], [[I_013]] +; CHECK-NEXT: store i32 [[ADD2]], i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i32, i32* [[A_ADDR_012]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[TMP2]], [[I_013]] +; CHECK-NEXT: store i32 [[ADD4]], i32* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_013]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1 + %tmp = load i32, i32* %A.addr.012, align 4 + %add = add i32 %tmp, %i.013 + store i32 %add, i32* %A.addr.012, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2 + %tmp1 = load i32, i32* %incdec.ptr, align 4 + %add2 = add i32 %tmp1, %i.013 + store i32 
%add2, i32* %incdec.ptr, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3 + %tmp2 = load i32, i32* %incdec.ptr1, align 4 + %add4 = add i32 %tmp2, %i.013 + store i32 %add4, i32* %incdec.ptr1, align 4 + %inc = add nuw nsw i32 %i.013, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Check vectorization on interleaved access groups with members having different +; kinds of type. + +; struct IntFloat { +; int a; +; float b; +; }; +; +; int SA; +; float SB; +; +; void int_float_struct(struct IntFloat *A) { +; int SumA; +; float SumB; +; for (unsigned i = 0; i < 1024; i++) { +; SumA += A[i].a; +; SumB += A[i].b; +; } +; SA = SumA; +; SB = SumB; +; } + + +%struct.IntFloat = type { i32, float } + +@SA = common global i32 0, align 4 +@SB = common global float 0.000000e+00, align 4 + +define void @int_float_struct(%struct.IntFloat* nocapture readonly %p) #0 { +; CHECK-LABEL: @int_float_struct( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ insertelement ( zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], %struct.IntFloat* [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* [[TMP2]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP3]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +; CHECK-NEXT: [[TMP7]] = add [[TMP4]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP8]] = fadd fast [[VEC_PHI]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP7]]) +; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: 
[[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* @SA, align 4 +; CHECK-NEXT: store float [[ADD3_LCSSA]], float* @SB, align 4 +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUMB_014:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUMA_013:%.*]] = phi i32 [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT]], %struct.IntFloat* [[P]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP]], [[SUMA_013]] +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT]], %struct.IntFloat* [[P]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[B]], align 4 +; CHECK-NEXT: [[ADD3]] = fadd fast float [[SUMB_014]], [[TMP1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + store i32 %add, i32* @SA, align 4 + store float %add3, float* @SB, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %p, i64 %indvars.iv, i32 0 + %tmp = load i32, i32* %a, align 4 + %add = add nsw i32 %tmp, %SumA.013 + %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %p, i64 %indvars.iv, i32 1 + %tmp1 = load float, float* %b, align 4 + %add3 = fadd fast float %SumB.014, %tmp1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Check vectorization of interleaved access groups in the presence of +; dependences (PR27626). The following tests check that we don't reorder +; dependent loads and stores when generating code for interleaved access +; groups. Stores should be scalarized because the required code motion would +; break dependences, and the remaining interleaved load groups should have +; gaps. + +; PR27626_0: Ensure a strided store is not moved after a dependent (zero +; distance) strided load. 
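+
+; For PR27626_0 the CHECK lines below pin down an ordering along these lines
+; (a sketch; %z.splat, %p.x, %p.y and %p.x.base are illustrative names and the
+; types are the nxv4i32 forms used throughout this file): the store of %z stays
+; a scatter and is not sunk past the zero-distance load of p[i].x, and the load
+; group keeps its gap, with only field 0 of the result used.
+;
+;   call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(
+;          <vscale x 4 x i32> %z.splat, <vscale x 4 x i32*> %p.x, i32 4, <vscale x 4 x i1> %alltrue)
+;   %ld = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
+;           @llvm.experimental.masked.interleaved2.load.nxv4i32(
+;             i32* %p.x.base, i32 4, <vscale x 4 x i1> %alltrue, <vscale x 4 x i32> undef)
+;   %x  = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld, 0
+;   call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(
+;          <vscale x 4 x i32> %x, <vscale x 4 x i32*> %p.y, i32 4, <vscale x 4 x i1> %alltrue)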
+ +; void PR27626_0(struct pair *p, int z, int n) { +; for (int i = 0; i < n; i++) { +; p[i].x = z; +; p[i].y = p[i].x; +; } +; } + + +%pair.i32 = type { i32, i32 } +;TODO: uses sve masked scatter for p[i+1].y store for neon we have scalarised store +; what is actually what this test is checking +define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) #1 { +; CHECK-LABEL: @PR27626_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[BC:%.*]] = bitcast [[TMP9]] to *> +; CHECK-NEXT: [[TMP11:%.*]] = extractelement *> [[BC]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr , * [[TMP11]], i64 0, i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[TMP13]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP14]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: 
scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 +; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_X]], align 4 +; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_Y]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 + %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 + store i32 %z, i32* %p_i.x, align 4 + %0 = load i32, i32* %p_i.x, align 4 + store i32 %0, i32 *%p_i.y, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; PR27626_1: Ensure a strided load is not moved before a dependent (zero +; distance) strided store. + +; void PR27626_1(struct pair *p, int n) { +; int s = 0; +; for (int i = 0; i < n; i++) { +; p[i].y = p[i].x; +; s += p[i].y +; } +; } + + +;TODO: uses sve masked scatter for p[i+1].y store for neon we have scalarised store +; what is actually what this test is checking +define i32 @PR27626_1(%pair.i32 *%p, i64 %n) #1 { +; CHECK-LABEL: @PR27626_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], 
[[VEC_IND]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[TMP11]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP12]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[BC:%.*]] = bitcast [[TMP10]] to *> +; CHECK-NEXT: [[TMP13:%.*]] = extractelement *> [[BC]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr , * [[TMP13]], i64 0, i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[TMP15]], 0 +; CHECK-NEXT: [[TMP17]] = add [[TMP16]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP17]]) +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP23:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 +; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[P_I_X]], align 4 +; CHECK-NEXT: store i32 [[TMP22]], i32* [[P_I_Y]], align 4 +; CHECK-NEXT: [[TMP23]] = add nsw i32 [[TMP22]], [[S]] +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 [[TMP23]] +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %s = phi i32 [ %2, %for.body ], [ 0, %entry ] + %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 + %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 + %0 = load i32, i32* %p_i.x, align 4 + store i32 %0, i32* %p_i.y, align 4 + %1 = load i32, i32* %p_i.y, align 4 + %2 = add nsw i32 %1, %s + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %3 = phi i32 [ %2, %for.body ] + ret i32 %3 +} + +; PR27626_2: Ensure a strided store is not moved after a dependent (negative +; distance) strided load. 
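+
+; For PR27626_2 the expectation sketched below (illustrative names, nxv4i32
+; types assumed) is that the p[i-1].x values are fetched through the factor-2
+; intrinsic from a base rewound by one pair, while the dependent stores of %z
+; and of the loaded values remain scatters and are not reordered across it:
+;
+;   %base   = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 -1, i32 0
+;   %ld     = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
+;               @llvm.experimental.masked.interleaved2.load.nxv4i32(
+;                 i32* %base, i32 4, <vscale x 4 x i1> %alltrue, <vscale x 4 x i32> undef)
+;   %prev.x = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld, 0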
+ +; void PR27626_2(struct pair *p, int z, int n) { +; for (int i = 0; i < n; i++) { +; p[i].x = z; +; p[i].y = p[i - 1].x; +; } +; } + + +;TODO: uses sve masked scatter for p[i+1].y store for neon we have scalarised store +; what is actually what this test is checking +define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) #1 { +; CHECK-LABEL: @PR27626_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP12:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* nonnull [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[TMP12]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP13]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] 
+; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 +; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0 +; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_X]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[P_I_MINUS_1_X]], align 4 +; CHECK-NEXT: store i32 [[TMP17]], i32* [[P_I_Y]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP35:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %i_minus_1 = add nuw nsw i64 %i, -1 + %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 + %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0 + %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 + store i32 %z, i32* %p_i.x, align 4 + %0 = load i32, i32* %p_i_minus_1.x, align 4 + store i32 %0, i32 *%p_i.y, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; PR27626_3: Ensure a strided load is not moved before a dependent (negative +; distance) strided store. + +; void PR27626_3(struct pair *p, int z, int n) { +; for (int i = 0; i < n; i++) { +; p[i + 1].y = p[i].x; +; s += p[i].y; +; } +; } + + +;TODO: uses sve masked scatter for p[i+1].y store for neon we have scalarised store +; what is actually what this test is checking +define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) #1 { +; CHECK-LABEL: @PR27626_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[TMP13]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP14]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP15:%.*]] = call { , } @llvm.experimental.masked.interleaved2.load.nxv4i32(i32* nonnull [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[TMP15]], 0 +; CHECK-NEXT: [[TMP17]] = add [[TMP16]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP17]]) +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP24:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 +; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I_PLUS_1]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[P_I_X]], align 4 +; CHECK-NEXT: store i32 [[TMP22]], i32* [[P_I_PLUS_1_Y]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[P_I_Y]], align 4 +; CHECK-NEXT: [[TMP24]] = add nsw i32 [[TMP23]], [[S]] +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 [[TMP24]] +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %s = phi i32 [ %2, %for.body ], [ 0, %entry ] + %i_plus_1 = add nuw nsw i64 %i, 1 + %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 + %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 
1 + %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1 + %0 = load i32, i32* %p_i.x, align 4 + store i32 %0, i32* %p_i_plus_1.y, align 4 + %1 = load i32, i32* %p_i.y, align 4 + %2 = add nsw i32 %1, %s + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %3 = phi i32 [ %2, %for.body ] + ret i32 %3 +} + +; PR27626_4: Ensure we form an interleaved group for strided stores in the +; presence of a write-after-write dependence. We create a group for +; (2) and (3) while excluding (1). + +; void PR27626_4(int *a, int x, int y, int z, int n) { +; for (int i = 0; i < n; i += 2) { +; a[i] = x; // (1) +; a[i] = y; // (2) +; a[i + 1] = z; // (3) +; } +; } + +;TODO: uses sve masked scatter, but for neon we have a scalarised store for a[i] = x what is fine +define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { +; CHECK-LABEL: @PR27626_4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2) +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl [[TMP7]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP8]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[A]], i64 -1 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], [[TMP12]], i32 
4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP13]], i64 [[TMP11]] +; CHECK-NEXT: call void @llvm.experimental.masked.interleaved2.store.nxv4i32( [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]], i32* [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_PLUS_1:%.*]] = or i64 [[I]], 1 +; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] +; CHECK-NEXT: [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_PLUS_1]] +; CHECK-NEXT: store i32 [[Y]], i32* [[A_I]], align 4 +; CHECK-NEXT: store i32 [[Z]], i32* [[A_I_PLUS_1]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP39:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %i_plus_1 = add i64 %i, 1 + %a_i = getelementptr inbounds i32, i32* %a, i64 %i + %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1 + store i32 %x, i32* %a_i, align 4 + store i32 %y, i32* %a_i, align 4 + store i32 %z, i32* %a_i_plus_1, align 4 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; PR27626_5: Ensure we do not form an interleaved group for strided stores in +; the presence of a write-after-write dependence. 
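+;
+; Because no interleaved store group is formed, the vector body below is
+; expected to contain three independent masked scatters rather than a single
+; interleaved store. An illustrative sketch of one such call, with the
+; scalable vector types written out in full (%val, %ptrs and %mask are
+; placeholder names):
+;
+;   call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %val,
+;       <vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask)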
+ +; void PR27626_5(int *a, int x, int y, int z, int n) { +; for (int i = 3; i < n; i += 2) { +; a[i - 1] = x; +; a[i - 3] = y; +; a[i] = z; +; } +; } + + +;TODO: uses masked scatter, but this is a test which checks if interleaving is not used +define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { +; CHECK-LABEL: @PR27626_5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5) +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP7]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl [[TMP8]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add [[TMP9]], shufflevector ( insertelement ( poison, i64 3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = add [[VEC_IND]], shufflevector ( insertelement ( poison, i64 -1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = add [[VEC_IND]], shufflevector ( insertelement ( poison, i64 -3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], [[TMP13]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], [[TMP15]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT2]], [[TMP16]], i32 4, 
shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer))
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT4]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer))
+; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[I_MINUS_1:%.*]] = add i64 [[I]], -1
+; CHECK-NEXT: [[I_MINUS_3:%.*]] = add i64 [[I]], -3
+; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_1]]
+; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_3]]
+; CHECK-NEXT: store i32 [[X]], i32* [[A_I_MINUS_1]], align 4
+; CHECK-NEXT: store i32 [[Y]], i32* [[A_I_MINUS_3]], align 4
+; CHECK-NEXT: store i32 [[Z]], i32* [[A_I]], align 4
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP41:![0-9]+]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
+ %i_minus_1 = sub i64 %i, 1
+ %i_minus_3 = sub i64 %i_minus_1, 2
+ %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+ %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
+ %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
+ store i32 %x, i32* %a_i_minus_1, align 4
+ store i32 %y, i32* %a_i_minus_3, align 4
+ store i32 %z, i32* %a_i, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR34743: Ensure that a cast which needs to sink after a load that belongs to
+; an interleaved group does indeed get sunk.
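+;
+; Concretely, the vector body is expected to splice the recurrence with the
+; newly gathered vector first and only then sign-extend the result, i.e. the
+; cast is sunk past the load it depends on. An illustrative sketch with the
+; scalable types written out (%recur and %gather are placeholder names):
+;
+;   %spliced = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(
+;                  <vscale x 4 x i16> %recur, <vscale x 4 x i16> %gather, i32 -1)
+;   %ext     = sext <vscale x 4 x i16> %spliced to <vscale x 4 x i32>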
+ +; void PR34743(short *a, int *b, int n) { +; for (int i = 0, iv = 0; iv < n; i++, iv += 2) { +; b[i] = a[iv] * a[iv+1] * a[iv+2]; +; } +; } + + +define void @PR34743(i16* %a, i32* %b, i64 %n) #1 { +; CHECK-LABEL: @PR34743( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[A:%.*]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[N]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw i64 [[TMP4]], 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i16, i16* [[A]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[N]], -2 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 3 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i16, i16* [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[SCEVGEP5]] to i32* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[TMP8]], [[B]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[SCEVGEP]] to i16* +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i16* [[SCEVGEP3]], [[TMP9]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP11]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i32 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = add nsw i32 [[TMP13]], -1 +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i16 [[DOTPRE]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl [[TMP15]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP18]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP16]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 2, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[A]], [[TMP19]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( [[TMP21]], i32 4, shufflevector ( 
insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison), !alias.scope !42 +; CHECK-NEXT: [[TMP22:%.*]] = sext [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, i16* [[A]], [[TMP20]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER8]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( [[TMP23]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison), !alias.scope !42 +; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.experimental.vector.splice.nxv4i16( [[VECTOR_RECUR]], [[WIDE_MASKED_GATHER8]], i32 -1) +; CHECK-NEXT: [[TMP25:%.*]] = sext [[TMP24]] to +; CHECK-NEXT: [[TMP26:%.*]] = sext [[WIDE_MASKED_GATHER8]] to +; CHECK-NEXT: [[TMP27:%.*]] = mul nsw [[TMP25]], [[TMP22]] +; CHECK-NEXT: [[TMP28:%.*]] = mul nsw [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * +; CHECK-NEXT: store [[TMP28]], * [[TMP30]], align 4, !alias.scope !45, !noalias !42 +; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP32:%.*]] = shl nuw nsw i64 [[TMP31]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP32]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP35:%.*]] = shl nuw nsw i32 [[TMP34]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = add nsw i32 [[TMP35]], -1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_MASKED_GATHER8]], i32 [[TMP36]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32 +; CHECK-NEXT: [[I1]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[IV1:%.*]] = or i64 [[IV]], 1 +; CHECK-NEXT: [[IV2]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV1]] +; CHECK-NEXT: [[LOAD1:%.*]] = load i16, i16* [[GEP1]], align 4 +; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV2]] +; CHECK-NEXT: [[LOAD2]] = load i16, i16* [[GEP2]], align 4 +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32 +; CHECK-NEXT: [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]] +; CHECK-NEXT: [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: store i32 
[[MUL012]], i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP48:![0-9]+]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + %.pre = load i16, i16* %a + br label %loop + +loop: + %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ] + %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ] + %i = phi i64 [ 0, %entry ], [ %i1, %loop ] + %conv = sext i16 %0 to i32 + %i1 = add nuw nsw i64 %i, 1 + %iv1 = add nuw nsw i64 %iv, 1 + %iv2 = add nuw nsw i64 %iv, 2 + %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1 + %load1 = load i16, i16* %gep1, align 4 + %conv1 = sext i16 %load1 to i32 + %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2 + %load2 = load i16, i16* %gep2, align 4 + %conv2 = sext i16 %load2 to i32 + %mul01 = mul nsw i32 %conv, %conv1 + %mul012 = mul nsw i32 %mul01, %conv2 + %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i + store i32 %mul012, i32* %arrayidx5 + %exitcond = icmp eq i64 %iv, %n + br i1 %exitcond, label %end, label %loop + +end: + ret void +} + +attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } +attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) }
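+
+; For reference, an illustrative fully-typed form of the factor-2 masked
+; interleaved intrinsic calls matched by the CHECK lines above (%ptr, %mask,
+; %v0 and %v1 are placeholder names):
+;
+;   %ld = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
+;             @llvm.experimental.masked.interleaved2.load.nxv4i32(
+;             i32* %ptr, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+;
+;   call void @llvm.experimental.masked.interleaved2.store.nxv4i32(
+;             <vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1,
+;             i32* %ptr, i32 4, <vscale x 4 x i1> %mask)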