diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -30,6 +30,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -123,6 +124,7 @@ struct CachingVPExpander { Function &F; const TargetTransformInfo &TTI; + const DataLayout &DL; /// \returns A (fixed length) vector with ascending integer indices /// (<0, 1, ..., NumElems-1>). @@ -148,6 +150,7 @@ Value *convertEVLToMask(IRBuilder<> &Builder, Value *EVLParam, ElementCount ElemCount); + bool isEVLOnlyMemoryIntrinsic(VPIntrinsic &VPI); Value *foldEVLIntoMask(VPIntrinsic &VPI); /// "Remove" the %evl parameter of \p PI by setting it to the static vector @@ -172,9 +175,18 @@ VPLegalization getVPLegalizationStrategy(const VPIntrinsic &VPI) const; bool UsingTTIOverrides; + /// \brief Lower this llvm.vp.(load|store|gather|scatter) to a non-vp + /// instruction. + Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder, + VPIntrinsic &VPI); + + Value *expandPredicationInUnfoldedLoadStore(IRBuilder<> &Builder, + VPIntrinsic &VPI); + public: - CachingVPExpander(Function &F, const TargetTransformInfo &TTI) - : F(F), TTI(TTI), UsingTTIOverrides(anyExpandVPOverridesSet()) {} + CachingVPExpander(Function &F, const TargetTransformInfo &TTI, + const DataLayout &DL) + : F(F), TTI(TTI), DL(DL), UsingTTIOverrides(anyExpandVPOverridesSet()) {} bool expandVectorPredication(); }; @@ -383,6 +395,283 @@ return Reduction; } +/// \brief Lower this llvm.vp.(load|store|gather|scatter) to a non-vp +/// instruction. 
+Value *
+CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+                                                      VPIntrinsic &VPI) {
+  // Callers guarantee the EVL is a no-op; only the mask matters here.
+  assert(VPI.canIgnoreVectorLengthParam());
+
+  Value *MaskParam = VPI.getMaskParam();
+  Value *PtrParam = VPI.getMemoryPointerParam();
+  Value *DataParam = VPI.getMemoryDataParam();
+  // An all-ones mask lets us emit a plain (unmasked) load/store.
+  bool IsUnmasked = isAllTrueMask(MaskParam);
+
+  MaybeAlign AlignOpt = VPI.getPointerAlignment();
+
+  Value *NewMemoryInst = nullptr;
+  switch (VPI.getIntrinsicID()) {
+  default:
+    llvm_unreachable("Not a VP memory intrinsic");
+
+  case Intrinsic::vp_store: {
+    if (IsUnmasked) {
+      StoreInst *NewStore = Builder.CreateStore(DataParam, PtrParam, false);
+      if (AlignOpt.hasValue())
+        NewStore->setAlignment(AlignOpt.getValue());
+      NewMemoryInst = NewStore;
+    } else {
+      NewMemoryInst = Builder.CreateMaskedStore(
+          DataParam, PtrParam, AlignOpt.valueOrOne(), MaskParam);
+    }
+  } break;
+
+  case Intrinsic::vp_load: {
+    if (IsUnmasked) {
+      LoadInst *NewLoad = Builder.CreateLoad(VPI.getType(), PtrParam, false);
+      if (AlignOpt.hasValue())
+        NewLoad->setAlignment(AlignOpt.getValue());
+      NewMemoryInst = NewLoad;
+    } else {
+      NewMemoryInst = Builder.CreateMaskedLoad(
+          VPI.getType(), PtrParam, AlignOpt.valueOrOne(), MaskParam);
+    }
+  } break;
+
+  case Intrinsic::vp_scatter: {
+    // Scatter/gather have no unmasked IR form; always use the masked
+    // intrinsic and let later lowering simplify an all-ones mask.
+    NewMemoryInst = Builder.CreateMaskedScatter(
+        DataParam, PtrParam, AlignOpt.valueOrOne(), MaskParam);
+  } break;
+
+  case Intrinsic::vp_gather: {
+    NewMemoryInst = Builder.CreateMaskedGather(VPI.getType(), PtrParam,
+                                               AlignOpt.valueOrOne(), MaskParam,
+                                               nullptr, VPI.getName());
+  } break;
+  }
+
+  assert(NewMemoryInst);
+  replaceOperation(*NewMemoryInst, VPI);
+  return NewMemoryInst;
+}
+
+// The following are helper functions for loading and storing subvectors with
+// variable offsets. There is currently no support for shuffles with
+// non-constant masks, so these operations have to be done lane by lane.
+
+// Create a load into Dest from the subvector of src given by a variable Offset
+// and constant Width.
Src is a pointer; Dest is a fixed-width vector; Offset +// and Width are specified in lanes. +Value *LoadSubvector(Value *Dest, Value *Src, Value *Offset, unsigned Width, + MaybeAlign EltAlign, Type *OffsetTy, + Instruction *InsertPt) { + assert(OffsetTy->isIntegerTy() && "Offset must be an integer type!"); + assert(Src->getType()->isPointerTy() && "Source must be a pointer!"); + assert(Dest->getType()->isVectorTy() && "Destination must be a vector!"); + Type *EltTy = Dest->getType()->getScalarType(); + IRBuilder<> Builder(InsertPt); + Builder.SetCurrentDebugLocation(InsertPt->getDebugLoc()); + Value *SrcEltPtr = Builder.CreatePointerCast( + Src, EltTy->getPointerTo(Src->getType()->getPointerAddressSpace())); + auto *SubvecSrc = Builder.CreateInBoundsGEP(EltTy, SrcEltPtr, Offset); + Value *VResult = Dest; + for (unsigned i = 0; i < Width; ++i) { + Value *vi = ConstantInt::get(OffsetTy, i); + auto *EltOffset = Builder.CreateAdd(Offset, vi); + auto *EltPtr = Builder.CreateInBoundsGEP(EltTy, SubvecSrc, vi); + Value *EltLoad = Builder.CreateAlignedLoad(EltTy, EltPtr, EltAlign); + VResult = Builder.CreateInsertElement(VResult, EltLoad, EltOffset); + } + return VResult; +} + +// Create a store into Dest of the subvector of Val given by a variable Offset +// and constant Width. Dest is a pointer; Val is a fixed-width vector; Offset +// and Width are specified in lanes. 
+// Marked static: file-local helper, mirrors LoadSubvector above.
+static void StoreSubvector(Value *Val, Value *Dest, Value *Offset,
+                           unsigned Width, MaybeAlign EltAlign, Type *OffsetTy,
+                           Instruction *InsertPt) {
+  assert(OffsetTy->isIntegerTy() && "Offset must be an integer type!");
+  assert(Dest->getType()->isPointerTy() && "Destination must be a pointer!");
+  assert(Val->getType()->isVectorTy() && "Value must be a vector!");
+  Type *EltTy = Val->getType()->getScalarType();
+  IRBuilder<> Builder(InsertPt);
+  Builder.SetCurrentDebugLocation(InsertPt->getDebugLoc());
+  // View the destination as an array of elements and step to lane Offset.
+  Value *DestEltPtr = Builder.CreatePointerCast(
+      Dest, EltTy->getPointerTo(Dest->getType()->getPointerAddressSpace()));
+  auto *SubvecDest = Builder.CreateInBoundsGEP(EltTy, DestEltPtr, Offset);
+  // Extract and store one lane at a time (no variable-mask shuffles exist).
+  for (unsigned i = 0; i < Width; ++i) {
+    Value *LaneIdx = ConstantInt::get(OffsetTy, i);
+    auto *EltOffset = Builder.CreateAdd(Offset, LaneIdx);
+    auto *EltPtr = Builder.CreateInBoundsGEP(EltTy, SubvecDest, LaneIdx);
+    Value *Elt = Builder.CreateExtractElement(Val, EltOffset);
+    Builder.CreateAlignedStore(Elt, EltPtr, EltAlign);
+  }
+}
+
+// We can split a vector store with variable length into contiguous conditional
+// stores of powers of 2, one for each active bit in the length value. The
+// offsets of the stores can be computed unconditionally using bitmasks of the
+// length. The resulting logic looks like this:
+//   PreBB:
+//     ... before intrinsic call
+//     goto HeadBB;
+//   HeadBB:
+//     if (Length == VectorWidth)
+//       goto ShortBB;
+//     else
+//       goto LongBB;
+//   ShortBB:
+//     load/store full vector
+//     goto PostBB;
+//   LongBB:
+//     for (int i = 0; i < LengthBits; ++i) {
+//       if (hasBitSet(Length, i))
+//         load/store subvector of width 2^i
+//     }
+//     goto PostBB;
+//   PostBB:
+//     after the intrinsic call ...
+ +Value * +CachingVPExpander::expandPredicationInUnfoldedLoadStore(IRBuilder<> &Builder, + VPIntrinsic &VPI) { + assert(!VPI.canIgnoreVectorLengthParam()); + unsigned OC = *VPI.getFunctionalOpcode(); + + auto &I = cast(VPI); + + Value *VLParam = VPI.getVectorLengthParam(); + Value *PtrParam = VPI.getMemoryPointerParam(); + Value *DataParam = VPI.getMemoryDataParam(); + + MaybeAlign AlignOpt = VPI.getPointerAlignment(); + + Value *NewMemoryInst = nullptr; + char const *Prefix; + + switch (OC) { + default: + abort(); // not a VP load or store + + case Instruction::Load: + Prefix = "vp.load."; + break; + case Instruction::Store: + Prefix = "vp.store."; + break; + } + + bool isLoad = (OC == Instruction::Load); + + auto *VecTy = isLoad ? cast(VPI.getType()) + : cast(DataParam->getType()); + unsigned VecNumElts = VecTy->getNumElements(); + Type *VecEltTy = VecTy->getElementType(); + Type *VLTy = VLParam->getType(); + + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + if (isa(VLParam)) { + switch (OC) { + case Instruction::Load: { + LoadInst *NewLoad = Builder.CreateLoad(VPI.getType(), PtrParam, false); + if (AlignOpt.hasValue()) + NewLoad->setAlignment(AlignOpt.getValue()); + NewMemoryInst = NewLoad; + } break; + case Instruction::Store: { + StoreInst *NewStore = Builder.CreateStore(DataParam, PtrParam, false); + if (AlignOpt.hasValue()) + NewStore->setAlignment(AlignOpt.getValue()); + NewMemoryInst = NewStore; + } break; + default: + break; + } + replaceOperation(*NewMemoryInst, VPI); + return NewMemoryInst; + } + + Instruction *ShortTerm, *LongTerm, *ThenTerm; + Value *Pred; + const Align BranchAlignment = commonAlignment( + AlignOpt.valueOrOne(), VecEltTy->getPrimitiveSizeInBits() / 8); + + Value *VResult = (isLoad ? 
UndefValue::get(VecTy) : nullptr); + + Pred = Builder.CreateICmpEQ(VLParam, ConstantInt::get(VLTy, VecNumElts)); + if (!isLoad) + VResult = I.getParent()->getTerminator(); + SplitBlockAndInsertIfThenElse(Pred, &I, &ShortTerm, &LongTerm); + ShortTerm->getParent()->setName(Twine(Prefix) + "short"); + LongTerm->getParent()->setName(Twine(Prefix) + "long"); + I.getParent()->setName(Twine(Prefix) + "exit"); + + unsigned LastBranchBit = Log2_64_Ceil(VecNumElts); + unsigned BranchMask = maskTrailingOnes(LastBranchBit); + unsigned BranchBit = LastBranchBit; + while (BranchBit--) { // postdecr to avoid compairing 0u-1 + unsigned BranchOffsetMask = + maskTrailingOnes(BranchBit + 1) ^ BranchMask; + unsigned BranchWidth = 1 << BranchBit; + Value *BranchWidthValue = ConstantInt::get(VLTy, BranchWidth); + Value *BranchOffsetMaskValue = ConstantInt::get(VLTy, BranchOffsetMask); + + BasicBlock *IfBB = LongTerm->getParent(); + Builder.SetInsertPoint(LongTerm); + Pred = Builder.CreateICmpUGT(Builder.CreateAnd(VLParam, BranchWidthValue), + ConstantInt::get(VLTy, 0)); + ThenTerm = SplitBlockAndInsertIfThen(Pred, LongTerm, /*Unreachable*/ false); + ThenTerm->getParent()->setName(Twine(Prefix) + "branch"); + LongTerm->getParent()->setName(Twine(Prefix) + "long"); + Builder.SetInsertPoint(ThenTerm); + Value *BranchOffsetValue = + Builder.CreateAnd(VLParam, BranchOffsetMaskValue); + + if (isLoad) { + Value *BranchVResult = + LoadSubvector(VResult, PtrParam, BranchOffsetValue, BranchWidth, + BranchAlignment, VLTy, ThenTerm); + Builder.SetInsertPoint(LongTerm); + PHINode *ThenPhi = Builder.CreatePHI(VecTy, 2); + ThenPhi->addIncoming(BranchVResult, ThenTerm->getParent()); + ThenPhi->addIncoming(VResult, IfBB); + VResult = ThenPhi; + } else { + StoreSubvector(DataParam, PtrParam, BranchOffsetValue, BranchWidth, + BranchAlignment, VLTy, ThenTerm); + } + } + + Builder.SetInsertPoint(ShortTerm); + Value *ShortVResult; + if (isLoad) { + LoadInst *ShortLoad = Builder.CreateLoad(VPI.getType(), 
PtrParam, false); + if (AlignOpt.hasValue()) + ShortLoad->setAlignment(AlignOpt.getValue()); + ShortVResult = ShortLoad; + } else { + StoreInst *ShortStore = Builder.CreateStore(DataParam, PtrParam, false); + if (AlignOpt.hasValue()) + ShortStore->setAlignment(AlignOpt.getValue()); + } + + if (isLoad) { + Builder.SetInsertPoint(&I); + PHINode *Phi = Builder.CreatePHI(VecTy, 2); + Phi->addIncoming(VResult, LongTerm->getParent()); + Phi->addIncoming(ShortVResult, ShortTerm->getParent()); + VResult = Phi; + } + + NewMemoryInst = VResult; + replaceOperation(*NewMemoryInst, VPI); + return NewMemoryInst; +} + void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n"); @@ -412,6 +701,30 @@ VPI.setVectorLengthParam(MaxEVL); } +bool CachingVPExpander::isEVLOnlyMemoryIntrinsic(VPIntrinsic &VPI) { + Value *Mask = VPI.getMaskParam(); + if (!Mask) + return true; + switch (VPI.getIntrinsicID()) { + default: + return false; + case Intrinsic::vp_load: + if ((isAllTrueMask(Mask) || isa(Mask)) && + !TTI.isLegalMaskedLoad(VPI.getType(), + VPI.getPointerAlignment().valueOrOne())) + return true; + else + return false; + case Intrinsic::vp_store: + if ((isAllTrueMask(Mask) || isa(Mask)) && + !TTI.isLegalMaskedLoad(VPI.getMemoryDataParam()->getType(), + VPI.getPointerAlignment().valueOrOne())) + return true; + else + return false; + } +} + Value *CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Folding vlen for " << VPI << '\n'); @@ -430,6 +743,11 @@ LLVM_DEBUG(dbgs() << "OLD evl: " << *OldEVLParam << '\n'); LLVM_DEBUG(dbgs() << "OLD mask: " << *OldMaskParam << '\n'); + // If the mask is trivial, and there is no lowering for the corresponding + // masked intrinsic, then we use the alternate evl-only scalarization. + if (isEVLOnlyMemoryIntrinsic(VPI)) + return nullptr; + // Convert the %evl predication into vector mask predication. 
ElementCount ElemCount = VPI.getStaticVectorLength(); Value *VLMask = convertEVLToMask(Builder, OldEVLParam, ElemCount); @@ -459,6 +777,19 @@ if (auto *VPRI = dyn_cast(&VPI)) return expandPredicationInReduction(Builder, *VPRI); + switch (VPI.getIntrinsicID()) { + default: + abort(); // unexpected intrinsic + case Intrinsic::vp_load: + case Intrinsic::vp_store: + if (isEVLOnlyMemoryIntrinsic(VPI)) { + return expandPredicationInUnfoldedLoadStore(Builder, VPI); + } else { + return expandPredicationInMemoryIntrinsic(Builder, VPI); + } + break; + } + return &VPI; } @@ -572,7 +903,8 @@ bool runOnFunction(Function &F) override { const auto *TTI = &getAnalysis().getTTI(F); - CachingVPExpander VPExpander(F, *TTI); + const auto &DL = F.getParent()->getDataLayout(); + CachingVPExpander VPExpander(F, *TTI, DL); return VPExpander.expandVectorPredication(); } @@ -598,7 +930,8 @@ PreservedAnalyses ExpandVectorPredicationPass::run(Function &F, FunctionAnalysisManager &AM) { const auto &TTI = AM.getResult(F); - CachingVPExpander VPExpander(F, TTI); + const auto &DL = F.getParent()->getDataLayout(); + CachingVPExpander VPExpander(F, TTI, DL); if (!VPExpander.expandVectorPredication()) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/llvm/test/CodeGen/PowerPC/ldst-with-length-scalar.ll b/llvm/test/CodeGen/PowerPC/ldst-with-length-scalar.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/ldst-with-length-scalar.ll @@ -0,0 +1,1704 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -expandvp -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s + +define void @store_vl_v4i32(<4 x i32>* %ptr, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v4i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 4 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <4 x i32> [[VAL:%.*]], <4 x i32>* [[PTR:%.*]], align 
16 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[VAL]], i32 [[TMP10]] +; CHECK-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i32 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[VP_STORE_BRANCH2:%.*]], label [[VP_STORE_LONG3:%.*]] +; CHECK: vp.store.branch2: +; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP15]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[VAL]], i32 [[TMP18]] +; CHECK-NEXT: store i32 [[TMP20]], i32* [[TMP19]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG3]] +; CHECK: vp.store.long3: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void 
@llvm.vp.store.v4i32(<4 x i32> %val, <4 x i32>* %ptr, <4 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v4i32(<4 x i32>, <4 x i32>*, <4 x i1>, i32) +define <4 x i32> @load_vl_v4i32_i32(<4 x i32>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v4i32_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 4 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[PTR:%.*]], align 16 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> undef, i32 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP14]], i32 [[TMP12]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i32> [ [[TMP15]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i32 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[TMP18]], label [[VP_LOAD_BRANCH2:%.*]], label [[VP_LOAD_LONG3:%.*]] +; CHECK: vp.load.branch2: +; CHECK-NEXT: 
[[TMP19:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP19]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP24]], i32 [[TMP22]] +; CHECK-NEXT: br label [[VP_LOAD_LONG3]] +; CHECK: vp.load.long3: +; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ [[TMP25]], [[VP_LOAD_BRANCH2]] ], [ [[TMP16]], [[VP_LOAD_LONG1]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP26]], [[VP_LOAD_LONG3]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <4 x i32> [[TMP27]] +; + %res = call <4 x i32> @llvm.vp.load.v4i32(<4 x i32>* %ptr, <4 x i1> undef, i32 %evl) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.vp.load.v4i32(<4 x i32>*, <4 x i1>, i32) + +define void @store_vl_v3i32(<3 x i32>* %ptr, <3 x i32> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v3i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 3 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <3 x i32> [[VAL:%.*]], <3 x i32>* [[PTR:%.*]], align 16 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <3 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = 
getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <3 x i32> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x i32> [[VAL]], i32 [[TMP10]] +; CHECK-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i32 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[VP_STORE_BRANCH2:%.*]], label [[VP_STORE_LONG3:%.*]] +; CHECK: vp.store.branch2: +; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <3 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP15]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x i32> [[VAL]], i32 [[TMP18]] +; CHECK-NEXT: store i32 [[TMP20]], i32* [[TMP19]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG3]] +; CHECK: vp.store.long3: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v3i32(<3 x i32> %val, <3 x i32>* %ptr, <3 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v3i32(<3 x i32>, <3 x i32>*, <3 x i1>, i32) +define <3 x i32> @load_vl_v3i32_i32(<3 x i32>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v3i32_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 3 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <3 x i32>, <3 x i32>* [[PTR:%.*]], align 16 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; 
CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <3 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x i32> undef, i32 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x i32> [[TMP11]], i32 [[TMP14]], i32 [[TMP12]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP16:%.*]] = phi <3 x i32> [ [[TMP15]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i32 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[TMP18]], label [[VP_LOAD_BRANCH2:%.*]], label [[VP_LOAD_LONG3:%.*]] +; CHECK: vp.load.branch2: +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <3 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP19]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <3 x i32> [[TMP16]], i32 [[TMP24]], i32 [[TMP22]] +; CHECK-NEXT: br label [[VP_LOAD_LONG3]] +; CHECK: vp.load.long3: +; CHECK-NEXT: [[TMP26:%.*]] = phi <3 x i32> 
[ [[TMP25]], [[VP_LOAD_BRANCH2]] ], [ [[TMP16]], [[VP_LOAD_LONG1]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP27:%.*]] = phi <3 x i32> [ [[TMP26]], [[VP_LOAD_LONG3]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <3 x i32> [[TMP27]] +; + %res = call <3 x i32> @llvm.vp.load.v3i32(<3 x i32>* %ptr, <3 x i1> undef, i32 %evl) + ret <3 x i32> %res +} +declare <3 x i32> @llvm.vp.load.v3i32(<3 x i32>*, <3 x i1>, i32) + +define void @store_vl_v2i32(<2 x i32>* %ptr, <2 x i32> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v2i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 2 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <2 x i32> [[VAL:%.*]], <2 x i32>* [[PTR:%.*]], align 8 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v2i32(<2 x i32> %val, <2 x i32>* %ptr, <2 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v2i32(<2 x i32>, <2 x i32>*, <2 x i1>, i32) +define <2 x i32> @load_vl_v2i32_i32(<2 x i32>* %ptr, i32 %evl) { +; 
CHECK-LABEL: @load_vl_v2i32_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 2 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[PTR:%.*]], align 8 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> undef, i32 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP11]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ [[TMP12]], [[VP_LOAD_LONG1]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <2 x i32> [[TMP13]] +; + %res = call <2 x i32> @llvm.vp.load.v2i32(<2 x i32>* %ptr, <2 x i1> undef, i32 %evl) + ret <2 x i32> %res +} +declare <2 x i32> @llvm.vp.load.v2i32(<2 x i32>*, <2 x i1>, i32) + +define void @store_vl_v7i32(<7 x i32>* %ptr, <7 x i32> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v7i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 7 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <7 x i32> [[VAL:%.*]], <7 x i32>* [[PTR:%.*]], align 32 +; CHECK-NEXT: br 
label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <7 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <7 x i32> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <7 x i32> [[VAL]], i32 [[TMP10]] +; CHECK-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <7 x i32> [[VAL]], i32 [[TMP13]] +; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP4]], 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <7 x i32> [[VAL]], i32 [[TMP16]] +; CHECK-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = icmp ugt i32 [[TMP19]], 0 +; CHECK-NEXT: br i1 [[TMP20]], label [[VP_STORE_BRANCH2:%.*]], label [[VP_STORE_LONG3:%.*]] +; CHECK: vp.store.branch2: +; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <7 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds i32, i32* [[TMP22]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <7 x i32> [[VAL]], i32 [[TMP24]] +; CHECK-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP21]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <7 x i32> [[VAL]], i32 [[TMP27]] +; CHECK-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG3]] +; CHECK: vp.store.long3: +; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ugt i32 [[TMP30]], 0 +; CHECK-NEXT: br i1 [[TMP31]], label [[VP_STORE_BRANCH4:%.*]], label [[VP_STORE_LONG5:%.*]] +; CHECK: vp.store.branch4: +; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[EVL]], 6 +; CHECK-NEXT: [[TMP33:%.*]] = bitcast <7 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP33]], i32 [[TMP32]] +; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP32]], 0 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP34]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <7 x i32> [[VAL]], i32 [[TMP35]] +; CHECK-NEXT: store i32 [[TMP37]], i32* [[TMP36]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG5]] +; CHECK: vp.store.long5: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v7i32(<7 x i32> %val, <7 x i32>* %ptr, <7 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v7i32(<7 x i32>, <7 x i32>*, <7 x i1>, i32) +define <7 x i32> @load_vl_v7i32_i32(<7 x i32>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v7i32_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 7 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; 
CHECK-NEXT: [[TMP2:%.*]] = load <7 x i32>, <7 x i32>* [[PTR:%.*]], align 32 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <7 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <7 x i32> undef, i32 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <7 x i32> [[TMP11]], i32 [[TMP14]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <7 x i32> [[TMP15]], i32 [[TMP18]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP5]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <7 x i32> [[TMP19]], i32 [[TMP22]], i32 [[TMP20]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP24:%.*]] = phi <7 x i32> [ [[TMP23]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ugt i32 [[TMP25]], 
0 +; CHECK-NEXT: br i1 [[TMP26]], label [[VP_LOAD_BRANCH2:%.*]], label [[VP_LOAD_LONG3:%.*]] +; CHECK: vp.load.branch2: +; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <7 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP28]], i32 [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <7 x i32> [[TMP24]], i32 [[TMP32]], i32 [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP27]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <7 x i32> [[TMP33]], i32 [[TMP36]], i32 [[TMP34]] +; CHECK-NEXT: br label [[VP_LOAD_LONG3]] +; CHECK: vp.load.long3: +; CHECK-NEXT: [[TMP38:%.*]] = phi <7 x i32> [ [[TMP37]], [[VP_LOAD_BRANCH2]] ], [ [[TMP24]], [[VP_LOAD_LONG1]] ] +; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP40:%.*]] = icmp ugt i32 [[TMP39]], 0 +; CHECK-NEXT: br i1 [[TMP40]], label [[VP_LOAD_BRANCH4:%.*]], label [[VP_LOAD_LONG5:%.*]] +; CHECK: vp.load.branch4: +; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[EVL]], 6 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <7 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[TMP42]], i32 [[TMP41]] +; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP41]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TMP43]], i32 0 +; CHECK-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 1 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <7 x i32> [[TMP38]], i32 [[TMP46]], i32 [[TMP44]] +; CHECK-NEXT: br label [[VP_LOAD_LONG5]] +; CHECK: vp.load.long5: +; CHECK-NEXT: [[TMP48:%.*]] = phi <7 x i32> [ [[TMP47]], [[VP_LOAD_BRANCH4]] ], [ [[TMP38]], [[VP_LOAD_LONG3]] ] +; 
CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP49:%.*]] = phi <7 x i32> [ [[TMP48]], [[VP_LOAD_LONG5]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <7 x i32> [[TMP49]] +; + %res = call <7 x i32> @llvm.vp.load.v7i32(<7 x i32>* %ptr, <7 x i1> undef, i32 %evl) + ret <7 x i32> %res +} +declare <7 x i32> @llvm.vp.load.v7i32(<7 x i32>*, <7 x i1>, i32) + +define void @store_vl_v8i32(<8 x i32>* %ptr, <8 x i32> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v8i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 8 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <8 x i32> [[VAL:%.*]], <8 x i32>* [[PTR:%.*]], align 32 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[VAL]], i32 [[TMP10]] +; CHECK-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[VAL]], i32 [[TMP13]] +; 
CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP4]], 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[VAL]], i32 [[TMP16]] +; CHECK-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = icmp ugt i32 [[TMP19]], 0 +; CHECK-NEXT: br i1 [[TMP20]], label [[VP_STORE_BRANCH2:%.*]], label [[VP_STORE_LONG3:%.*]] +; CHECK: vp.store.branch2: +; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP22]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[VAL]], i32 [[TMP24]] +; CHECK-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP21]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[VAL]], i32 [[TMP27]] +; CHECK-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG3]] +; CHECK: vp.store.long3: +; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ugt i32 [[TMP30]], 0 +; CHECK-NEXT: br i1 [[TMP31]], label [[VP_STORE_BRANCH4:%.*]], label [[VP_STORE_LONG5:%.*]] +; CHECK: vp.store.branch4: +; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[EVL]], 6 +; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP33]], i32 [[TMP32]] +; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP32]], 0 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* 
[[TMP34]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i32> [[VAL]], i32 [[TMP35]] +; CHECK-NEXT: store i32 [[TMP37]], i32* [[TMP36]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG5]] +; CHECK: vp.store.long5: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v8i32(<8 x i32> %val, <8 x i32>* %ptr, <8 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v8i32(<8 x i32>, <8 x i32>*, <8 x i1>, i32) +define <8 x i32> @load_vl_v8i32_i32(<8 x i32>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v8i32_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 8 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* [[PTR:%.*]], align 32 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> undef, i32 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP14]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = 
getelementptr inbounds i32, i32* [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[TMP18]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP5]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP22]], i32 [[TMP20]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP24:%.*]] = phi <8 x i32> [ [[TMP23]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ugt i32 [[TMP25]], 0 +; CHECK-NEXT: br i1 [[TMP26]], label [[VP_LOAD_BRANCH2:%.*]], label [[VP_LOAD_LONG3:%.*]] +; CHECK: vp.load.branch2: +; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP28]], i32 [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP32]], i32 [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP27]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP36]], i32 [[TMP34]] +; CHECK-NEXT: br label [[VP_LOAD_LONG3]] +; CHECK: vp.load.long3: +; CHECK-NEXT: [[TMP38:%.*]] = phi <8 x i32> [ [[TMP37]], [[VP_LOAD_BRANCH2]] ], [ [[TMP24]], [[VP_LOAD_LONG1]] ] +; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP40:%.*]] = icmp ugt i32 [[TMP39]], 0 +; CHECK-NEXT: br i1 
[[TMP40]], label [[VP_LOAD_BRANCH4:%.*]], label [[VP_LOAD_LONG5:%.*]] +; CHECK: vp.load.branch4: +; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[EVL]], 6 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[TMP42]], i32 [[TMP41]] +; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP41]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TMP43]], i32 0 +; CHECK-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 1 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <8 x i32> [[TMP38]], i32 [[TMP46]], i32 [[TMP44]] +; CHECK-NEXT: br label [[VP_LOAD_LONG5]] +; CHECK: vp.load.long5: +; CHECK-NEXT: [[TMP48:%.*]] = phi <8 x i32> [ [[TMP47]], [[VP_LOAD_BRANCH4]] ], [ [[TMP38]], [[VP_LOAD_LONG3]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP49:%.*]] = phi <8 x i32> [ [[TMP48]], [[VP_LOAD_LONG5]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <8 x i32> [[TMP49]] +; + %res = call <8 x i32> @llvm.vp.load.v8i32(<8 x i32>* %ptr, <8 x i1> undef, i32 %evl) + ret <8 x i32> %res +} +declare <8 x i32> @llvm.vp.load.v8i32(<8 x i32>*, <8 x i1>, i32) + +define void @store_vl_v15i32(<15 x i32>* %ptr, <15 x i32> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v15i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 15 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <15 x i32> [[VAL:%.*]], <15 x i32>* [[PTR:%.*]], align 64 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <15 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds 
i32, i32* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP10]] +; CHECK-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP13]] +; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP4]], 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP16]] +; CHECK-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP4]], 4 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 4 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP19]] +; CHECK-NEXT: store i32 [[TMP21]], i32* [[TMP20]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP4]], 5 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 5 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP22]] +; CHECK-NEXT: store i32 [[TMP24]], i32* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP4]], 6 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 6 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP25]] +; CHECK-NEXT: store i32 [[TMP27]], i32* [[TMP26]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP4]], 7 +; CHECK-NEXT: 
[[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 7 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP28]] +; CHECK-NEXT: store i32 [[TMP30]], i32* [[TMP29]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP32:%.*]] = icmp ugt i32 [[TMP31]], 0 +; CHECK-NEXT: br i1 [[TMP32]], label [[VP_STORE_BRANCH2:%.*]], label [[VP_STORE_LONG3:%.*]] +; CHECK: vp.store.branch2: +; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[EVL]], 8 +; CHECK-NEXT: [[TMP34:%.*]] = bitcast <15 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TMP34]], i32 [[TMP33]] +; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP33]], 0 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[TMP35]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP36]] +; CHECK-NEXT: store i32 [[TMP38]], i32* [[TMP37]], align 1 +; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP33]], 1 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP35]], i32 1 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP39]] +; CHECK-NEXT: store i32 [[TMP41]], i32* [[TMP40]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP33]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[TMP35]], i32 2 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP42]] +; CHECK-NEXT: store i32 [[TMP44]], i32* [[TMP43]], align 1 +; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP33]], 3 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP45]] +; CHECK-NEXT: store i32 [[TMP47]], i32* [[TMP46]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG3]] +; CHECK: vp.store.long3: +; CHECK-NEXT: [[TMP48:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP49:%.*]] = icmp ugt i32 [[TMP48]], 0 +; CHECK-NEXT: br i1 
[[TMP49]], label [[VP_STORE_BRANCH4:%.*]], label [[VP_STORE_LONG5:%.*]] +; CHECK: vp.store.branch4: +; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[EVL]], 12 +; CHECK-NEXT: [[TMP51:%.*]] = bitcast <15 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, i32* [[TMP51]], i32 [[TMP50]] +; CHECK-NEXT: [[TMP53:%.*]] = add i32 [[TMP50]], 0 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 0 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP53]] +; CHECK-NEXT: store i32 [[TMP55]], i32* [[TMP54]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = add i32 [[TMP50]], 1 +; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 1 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP56]] +; CHECK-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG5]] +; CHECK: vp.store.long5: +; CHECK-NEXT: [[TMP59:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP60:%.*]] = icmp ugt i32 [[TMP59]], 0 +; CHECK-NEXT: br i1 [[TMP60]], label [[VP_STORE_BRANCH6:%.*]], label [[VP_STORE_LONG7:%.*]] +; CHECK: vp.store.branch6: +; CHECK-NEXT: [[TMP61:%.*]] = and i32 [[EVL]], 14 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast <15 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[TMP62]], i32 [[TMP61]] +; CHECK-NEXT: [[TMP64:%.*]] = add i32 [[TMP61]], 0 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TMP63]], i32 0 +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <15 x i32> [[VAL]], i32 [[TMP64]] +; CHECK-NEXT: store i32 [[TMP66]], i32* [[TMP65]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG7]] +; CHECK: vp.store.long7: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v15i32(<15 x i32> %val, <15 x i32>* %ptr, <15 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v15i32(<15 x i32>, <15 x i32>*, <15 x i1>, i32) +define <15 x i32> 
@load_vl_v15i32_i32(<15 x i32>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v15i32_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 15 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <15 x i32>, <15 x i32>* [[PTR:%.*]], align 64 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <15 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <15 x i32> undef, i32 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <15 x i32> [[TMP11]], i32 [[TMP14]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <15 x i32> [[TMP15]], i32 [[TMP18]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP5]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <15 x i32> [[TMP19]], i32 [[TMP22]], i32 [[TMP20]] +; 
CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP5]], 4 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <15 x i32> [[TMP23]], i32 [[TMP26]], i32 [[TMP24]] +; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP5]], 5 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 5 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 1 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <15 x i32> [[TMP27]], i32 [[TMP30]], i32 [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP5]], 6 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 6 +; CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[TMP33]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <15 x i32> [[TMP31]], i32 [[TMP34]], i32 [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP5]], 7 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 1 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <15 x i32> [[TMP35]], i32 [[TMP38]], i32 [[TMP36]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP40:%.*]] = phi <15 x i32> [ [[TMP39]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP42:%.*]] = icmp ugt i32 [[TMP41]], 0 +; CHECK-NEXT: br i1 [[TMP42]], label [[VP_LOAD_BRANCH2:%.*]], label [[VP_LOAD_LONG3:%.*]] +; CHECK: vp.load.branch2: +; CHECK-NEXT: [[TMP43:%.*]] = and i32 [[EVL]], 8 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast <15 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i32 [[TMP43]] +; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP43]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[TMP45]], i32 0 +; CHECK-NEXT: [[TMP48:%.*]] = load i32, i32* [[TMP47]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] 
= insertelement <15 x i32> [[TMP40]], i32 [[TMP48]], i32 [[TMP46]] +; CHECK-NEXT: [[TMP50:%.*]] = add i32 [[TMP43]], 1 +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[TMP45]], i32 1 +; CHECK-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 1 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <15 x i32> [[TMP49]], i32 [[TMP52]], i32 [[TMP50]] +; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[TMP43]], 2 +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TMP45]], i32 2 +; CHECK-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <15 x i32> [[TMP53]], i32 [[TMP56]], i32 [[TMP54]] +; CHECK-NEXT: [[TMP58:%.*]] = add i32 [[TMP43]], 3 +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32* [[TMP45]], i32 3 +; CHECK-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP59]], align 1 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <15 x i32> [[TMP57]], i32 [[TMP60]], i32 [[TMP58]] +; CHECK-NEXT: br label [[VP_LOAD_LONG3]] +; CHECK: vp.load.long3: +; CHECK-NEXT: [[TMP62:%.*]] = phi <15 x i32> [ [[TMP61]], [[VP_LOAD_BRANCH2]] ], [ [[TMP40]], [[VP_LOAD_LONG1]] ] +; CHECK-NEXT: [[TMP63:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP64:%.*]] = icmp ugt i32 [[TMP63]], 0 +; CHECK-NEXT: br i1 [[TMP64]], label [[VP_LOAD_BRANCH4:%.*]], label [[VP_LOAD_LONG5:%.*]] +; CHECK: vp.load.branch4: +; CHECK-NEXT: [[TMP65:%.*]] = and i32 [[EVL]], 12 +; CHECK-NEXT: [[TMP66:%.*]] = bitcast <15 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[TMP66]], i32 [[TMP65]] +; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP65]], 0 +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[TMP67]], i32 0 +; CHECK-NEXT: [[TMP70:%.*]] = load i32, i32* [[TMP69]], align 1 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <15 x i32> [[TMP62]], i32 [[TMP70]], i32 [[TMP68]] +; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP65]], 1 +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP67]], i32 1 +; 
CHECK-NEXT: [[TMP74:%.*]] = load i32, i32* [[TMP73]], align 1 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <15 x i32> [[TMP71]], i32 [[TMP74]], i32 [[TMP72]] +; CHECK-NEXT: br label [[VP_LOAD_LONG5]] +; CHECK: vp.load.long5: +; CHECK-NEXT: [[TMP76:%.*]] = phi <15 x i32> [ [[TMP75]], [[VP_LOAD_BRANCH4]] ], [ [[TMP62]], [[VP_LOAD_LONG3]] ] +; CHECK-NEXT: [[TMP77:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP78:%.*]] = icmp ugt i32 [[TMP77]], 0 +; CHECK-NEXT: br i1 [[TMP78]], label [[VP_LOAD_BRANCH6:%.*]], label [[VP_LOAD_LONG7:%.*]] +; CHECK: vp.load.branch6: +; CHECK-NEXT: [[TMP79:%.*]] = and i32 [[EVL]], 14 +; CHECK-NEXT: [[TMP80:%.*]] = bitcast <15 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, i32* [[TMP80]], i32 [[TMP79]] +; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP79]], 0 +; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, i32* [[TMP81]], i32 0 +; CHECK-NEXT: [[TMP84:%.*]] = load i32, i32* [[TMP83]], align 1 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <15 x i32> [[TMP76]], i32 [[TMP84]], i32 [[TMP82]] +; CHECK-NEXT: br label [[VP_LOAD_LONG7]] +; CHECK: vp.load.long7: +; CHECK-NEXT: [[TMP86:%.*]] = phi <15 x i32> [ [[TMP85]], [[VP_LOAD_BRANCH6]] ], [ [[TMP76]], [[VP_LOAD_LONG5]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP87:%.*]] = phi <15 x i32> [ [[TMP86]], [[VP_LOAD_LONG7]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <15 x i32> [[TMP87]] +; + %res = call <15 x i32> @llvm.vp.load.v15i32(<15 x i32>* %ptr, <15 x i1> undef, i32 %evl) + ret <15 x i32> %res +} +declare <15 x i32> @llvm.vp.load.v15i32(<15 x i32>*, <15 x i1>, i32) + +define void @store_vl_v16i32(<16 x i32>* %ptr, <16 x i32> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v16i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 16 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <16 x i32> [[VAL:%.*]], <16 x i32>* 
[[PTR:%.*]], align 64 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP10]] +; CHECK-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP13]] +; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP4]], 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP16]] +; CHECK-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP4]], 4 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 4 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP19]] +; CHECK-NEXT: store i32 [[TMP21]], i32* [[TMP20]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP4]], 5 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 5 +; 
CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP22]] +; CHECK-NEXT: store i32 [[TMP24]], i32* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP4]], 6 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 6 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP25]] +; CHECK-NEXT: store i32 [[TMP27]], i32* [[TMP26]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP4]], 7 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 7 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP28]] +; CHECK-NEXT: store i32 [[TMP30]], i32* [[TMP29]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP32:%.*]] = icmp ugt i32 [[TMP31]], 0 +; CHECK-NEXT: br i1 [[TMP32]], label [[VP_STORE_BRANCH2:%.*]], label [[VP_STORE_LONG3:%.*]] +; CHECK: vp.store.branch2: +; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[EVL]], 8 +; CHECK-NEXT: [[TMP34:%.*]] = bitcast <16 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TMP34]], i32 [[TMP33]] +; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP33]], 0 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[TMP35]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP36]] +; CHECK-NEXT: store i32 [[TMP38]], i32* [[TMP37]], align 1 +; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP33]], 1 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP35]], i32 1 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP39]] +; CHECK-NEXT: store i32 [[TMP41]], i32* [[TMP40]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP33]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[TMP35]], i32 2 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP42]] +; CHECK-NEXT: store i32 [[TMP44]], i32* [[TMP43]], align 
1 +; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP33]], 3 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP45]] +; CHECK-NEXT: store i32 [[TMP47]], i32* [[TMP46]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG3]] +; CHECK: vp.store.long3: +; CHECK-NEXT: [[TMP48:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP49:%.*]] = icmp ugt i32 [[TMP48]], 0 +; CHECK-NEXT: br i1 [[TMP49]], label [[VP_STORE_BRANCH4:%.*]], label [[VP_STORE_LONG5:%.*]] +; CHECK: vp.store.branch4: +; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[EVL]], 12 +; CHECK-NEXT: [[TMP51:%.*]] = bitcast <16 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, i32* [[TMP51]], i32 [[TMP50]] +; CHECK-NEXT: [[TMP53:%.*]] = add i32 [[TMP50]], 0 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 0 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP53]] +; CHECK-NEXT: store i32 [[TMP55]], i32* [[TMP54]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = add i32 [[TMP50]], 1 +; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 1 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <16 x i32> [[VAL]], i32 [[TMP56]] +; CHECK-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG5]] +; CHECK: vp.store.long5: +; CHECK-NEXT: [[TMP59:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP60:%.*]] = icmp ugt i32 [[TMP59]], 0 +; CHECK-NEXT: br i1 [[TMP60]], label [[VP_STORE_BRANCH6:%.*]], label [[VP_STORE_LONG7:%.*]] +; CHECK: vp.store.branch6: +; CHECK-NEXT: [[TMP61:%.*]] = and i32 [[EVL]], 14 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast <16 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[TMP62]], i32 [[TMP61]] +; CHECK-NEXT: [[TMP64:%.*]] = add i32 [[TMP61]], 0 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TMP63]], i32 0 +; CHECK-NEXT: [[TMP66:%.*]] = 
extractelement <16 x i32> [[VAL]], i32 [[TMP64]] +; CHECK-NEXT: store i32 [[TMP66]], i32* [[TMP65]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG7]] +; CHECK: vp.store.long7: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v16i32(<16 x i32> %val, <16 x i32>* %ptr, <16 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v16i32(<16 x i32>, <16 x i32>*, <16 x i1>, i32) +define <16 x i32> @load_vl_v16i32_i32(<16 x i32>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v16i32_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 16 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 64 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> undef, i32 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP14]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* 
[[TMP7]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP18]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP5]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP22]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP5]], 4 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP26]], i32 [[TMP24]] +; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP5]], 5 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 5 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 1 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP30]], i32 [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP5]], 6 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 6 +; CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[TMP33]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP34]], i32 [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP5]], 7 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 1 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP38]], i32 [[TMP36]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP40:%.*]] = phi <16 x i32> [ [[TMP39]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP42:%.*]] = icmp ugt i32 [[TMP41]], 0 +; CHECK-NEXT: br i1 [[TMP42]], label [[VP_LOAD_BRANCH2:%.*]], label 
[[VP_LOAD_LONG3:%.*]] +; CHECK: vp.load.branch2: +; CHECK-NEXT: [[TMP43:%.*]] = and i32 [[EVL]], 8 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast <16 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i32 [[TMP43]] +; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP43]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[TMP45]], i32 0 +; CHECK-NEXT: [[TMP48:%.*]] = load i32, i32* [[TMP47]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[TMP48]], i32 [[TMP46]] +; CHECK-NEXT: [[TMP50:%.*]] = add i32 [[TMP43]], 1 +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[TMP45]], i32 1 +; CHECK-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 1 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <16 x i32> [[TMP49]], i32 [[TMP52]], i32 [[TMP50]] +; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[TMP43]], 2 +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TMP45]], i32 2 +; CHECK-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <16 x i32> [[TMP53]], i32 [[TMP56]], i32 [[TMP54]] +; CHECK-NEXT: [[TMP58:%.*]] = add i32 [[TMP43]], 3 +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32* [[TMP45]], i32 3 +; CHECK-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP59]], align 1 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <16 x i32> [[TMP57]], i32 [[TMP60]], i32 [[TMP58]] +; CHECK-NEXT: br label [[VP_LOAD_LONG3]] +; CHECK: vp.load.long3: +; CHECK-NEXT: [[TMP62:%.*]] = phi <16 x i32> [ [[TMP61]], [[VP_LOAD_BRANCH2]] ], [ [[TMP40]], [[VP_LOAD_LONG1]] ] +; CHECK-NEXT: [[TMP63:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP64:%.*]] = icmp ugt i32 [[TMP63]], 0 +; CHECK-NEXT: br i1 [[TMP64]], label [[VP_LOAD_BRANCH4:%.*]], label [[VP_LOAD_LONG5:%.*]] +; CHECK: vp.load.branch4: +; CHECK-NEXT: [[TMP65:%.*]] = and i32 [[EVL]], 12 +; CHECK-NEXT: [[TMP66:%.*]] = bitcast <16 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP67:%.*]] = 
getelementptr inbounds i32, i32* [[TMP66]], i32 [[TMP65]] +; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP65]], 0 +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[TMP67]], i32 0 +; CHECK-NEXT: [[TMP70:%.*]] = load i32, i32* [[TMP69]], align 1 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <16 x i32> [[TMP62]], i32 [[TMP70]], i32 [[TMP68]] +; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP65]], 1 +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP67]], i32 1 +; CHECK-NEXT: [[TMP74:%.*]] = load i32, i32* [[TMP73]], align 1 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <16 x i32> [[TMP71]], i32 [[TMP74]], i32 [[TMP72]] +; CHECK-NEXT: br label [[VP_LOAD_LONG5]] +; CHECK: vp.load.long5: +; CHECK-NEXT: [[TMP76:%.*]] = phi <16 x i32> [ [[TMP75]], [[VP_LOAD_BRANCH4]] ], [ [[TMP62]], [[VP_LOAD_LONG3]] ] +; CHECK-NEXT: [[TMP77:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP78:%.*]] = icmp ugt i32 [[TMP77]], 0 +; CHECK-NEXT: br i1 [[TMP78]], label [[VP_LOAD_BRANCH6:%.*]], label [[VP_LOAD_LONG7:%.*]] +; CHECK: vp.load.branch6: +; CHECK-NEXT: [[TMP79:%.*]] = and i32 [[EVL]], 14 +; CHECK-NEXT: [[TMP80:%.*]] = bitcast <16 x i32>* [[PTR]] to i32* +; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, i32* [[TMP80]], i32 [[TMP79]] +; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP79]], 0 +; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, i32* [[TMP81]], i32 0 +; CHECK-NEXT: [[TMP84:%.*]] = load i32, i32* [[TMP83]], align 1 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <16 x i32> [[TMP76]], i32 [[TMP84]], i32 [[TMP82]] +; CHECK-NEXT: br label [[VP_LOAD_LONG7]] +; CHECK: vp.load.long7: +; CHECK-NEXT: [[TMP86:%.*]] = phi <16 x i32> [ [[TMP85]], [[VP_LOAD_BRANCH6]] ], [ [[TMP76]], [[VP_LOAD_LONG5]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP87:%.*]] = phi <16 x i32> [ [[TMP86]], [[VP_LOAD_LONG7]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <16 x i32> [[TMP87]] +; + %res = call <16 x i32> 
@llvm.vp.load.v16i32(<16 x i32>* %ptr, <16 x i1> undef, i32 %evl) + ret <16 x i32> %res +} +declare <16 x i32> @llvm.vp.load.v16i32(<16 x i32>*, <16 x i1>, i32) + +define void @store_vl_v8i16(<8 x i16>* %ptr, <8 x i16> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v8i16( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 8 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <8 x i16> [[VAL:%.*]], <8 x i16>* [[PTR:%.*]], align 16 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[VAL]], i32 [[TMP10]] +; CHECK-NEXT: store i16 [[TMP12]], i16* [[TMP11]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[VAL]], i32 [[TMP13]] +; CHECK-NEXT: store i16 [[TMP15]], i16* [[TMP14]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP4]], 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x 
i16> [[VAL]], i32 [[TMP16]] +; CHECK-NEXT: store i16 [[TMP18]], i16* [[TMP17]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = icmp ugt i32 [[TMP19]], 0 +; CHECK-NEXT: br i1 [[TMP20]], label [[VP_STORE_BRANCH2:%.*]], label [[VP_STORE_LONG3:%.*]] +; CHECK: vp.store.branch2: +; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, i16* [[TMP22]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[VAL]], i32 [[TMP24]] +; CHECK-NEXT: store i16 [[TMP26]], i16* [[TMP25]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP21]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i16, i16* [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[VAL]], i32 [[TMP27]] +; CHECK-NEXT: store i16 [[TMP29]], i16* [[TMP28]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG3]] +; CHECK: vp.store.long3: +; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ugt i32 [[TMP30]], 0 +; CHECK-NEXT: br i1 [[TMP31]], label [[VP_STORE_BRANCH4:%.*]], label [[VP_STORE_LONG5:%.*]] +; CHECK: vp.store.branch4: +; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[EVL]], 6 +; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i16, i16* [[TMP33]], i32 [[TMP32]] +; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP32]], 0 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i16, i16* [[TMP34]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[VAL]], i32 [[TMP35]] +; CHECK-NEXT: store i16 [[TMP37]], i16* [[TMP36]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG5]] +; CHECK: vp.store.long5: +; CHECK-NEXT: br 
label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v8i16(<8 x i16> %val, <8 x i16>* %ptr, <8 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v8i16(<8 x i16>, <8 x i16>*, <8 x i1>, i32) +define <8 x i16> @load_vl_v8i16_i32(<8 x i16>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v8i16_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 8 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[PTR:%.*]], align 16 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i16> undef, i16 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> [[TMP11]], i16 [[TMP14]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP17]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[TMP18]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 
[[TMP5]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP21]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP22]], i32 [[TMP20]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP24:%.*]] = phi <8 x i16> [ [[TMP23]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ugt i32 [[TMP25]], 0 +; CHECK-NEXT: br i1 [[TMP26]], label [[VP_LOAD_BRANCH2:%.*]], label [[VP_LOAD_LONG3:%.*]] +; CHECK: vp.load.branch2: +; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i16, i16* [[TMP28]], i32 [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i16, i16* [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i16> [[TMP24]], i16 [[TMP32]], i32 [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP27]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i16, i16* [[TMP29]], i32 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i16, i16* [[TMP35]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <8 x i16> [[TMP33]], i16 [[TMP36]], i32 [[TMP34]] +; CHECK-NEXT: br label [[VP_LOAD_LONG3]] +; CHECK: vp.load.long3: +; CHECK-NEXT: [[TMP38:%.*]] = phi <8 x i16> [ [[TMP37]], [[VP_LOAD_BRANCH2]] ], [ [[TMP24]], [[VP_LOAD_LONG1]] ] +; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP40:%.*]] = icmp ugt i32 [[TMP39]], 0 +; CHECK-NEXT: br i1 [[TMP40]], label [[VP_LOAD_BRANCH4:%.*]], label [[VP_LOAD_LONG5:%.*]] +; CHECK: vp.load.branch4: +; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[EVL]], 6 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP43:%.*]] = 
getelementptr inbounds i16, i16* [[TMP42]], i32 [[TMP41]] +; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP41]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i16, i16* [[TMP43]], i32 0 +; CHECK-NEXT: [[TMP46:%.*]] = load i16, i16* [[TMP45]], align 1 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <8 x i16> [[TMP38]], i16 [[TMP46]], i32 [[TMP44]] +; CHECK-NEXT: br label [[VP_LOAD_LONG5]] +; CHECK: vp.load.long5: +; CHECK-NEXT: [[TMP48:%.*]] = phi <8 x i16> [ [[TMP47]], [[VP_LOAD_BRANCH4]] ], [ [[TMP38]], [[VP_LOAD_LONG3]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP49:%.*]] = phi <8 x i16> [ [[TMP48]], [[VP_LOAD_LONG5]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <8 x i16> [[TMP49]] +; + %res = call <8 x i16> @llvm.vp.load.v8i16(<8 x i16>* %ptr, <8 x i1> undef, i32 %evl) + ret <8 x i16> %res +} +declare <8 x i16> @llvm.vp.load.v8i16(<8 x i16>*, <8 x i1>, i32) + +define void @store_vl_v7i16(<7 x i16>* %ptr, <7 x i16> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v7i16( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 7 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <7 x i16> [[VAL:%.*]], <7 x i16>* [[PTR:%.*]], align 16 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <7 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <7 x i16> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: 
store i16 [[TMP9]], i16* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <7 x i16> [[VAL]], i32 [[TMP10]] +; CHECK-NEXT: store i16 [[TMP12]], i16* [[TMP11]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <7 x i16> [[VAL]], i32 [[TMP13]] +; CHECK-NEXT: store i16 [[TMP15]], i16* [[TMP14]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP4]], 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <7 x i16> [[VAL]], i32 [[TMP16]] +; CHECK-NEXT: store i16 [[TMP18]], i16* [[TMP17]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = icmp ugt i32 [[TMP19]], 0 +; CHECK-NEXT: br i1 [[TMP20]], label [[VP_STORE_BRANCH2:%.*]], label [[VP_STORE_LONG3:%.*]] +; CHECK: vp.store.branch2: +; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <7 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, i16* [[TMP22]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <7 x i16> [[VAL]], i32 [[TMP24]] +; CHECK-NEXT: store i16 [[TMP26]], i16* [[TMP25]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP21]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i16, i16* [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <7 x i16> [[VAL]], i32 [[TMP27]] +; CHECK-NEXT: store i16 [[TMP29]], i16* [[TMP28]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG3]] +; CHECK: vp.store.long3: +; CHECK-NEXT: [[TMP30:%.*]] = 
and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ugt i32 [[TMP30]], 0 +; CHECK-NEXT: br i1 [[TMP31]], label [[VP_STORE_BRANCH4:%.*]], label [[VP_STORE_LONG5:%.*]] +; CHECK: vp.store.branch4: +; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[EVL]], 6 +; CHECK-NEXT: [[TMP33:%.*]] = bitcast <7 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i16, i16* [[TMP33]], i32 [[TMP32]] +; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP32]], 0 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i16, i16* [[TMP34]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <7 x i16> [[VAL]], i32 [[TMP35]] +; CHECK-NEXT: store i16 [[TMP37]], i16* [[TMP36]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG5]] +; CHECK: vp.store.long5: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v7i16(<7 x i16> %val, <7 x i16>* %ptr, <7 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v7i16(<7 x i16>, <7 x i16>*, <7 x i1>, i32) +define <7 x i16> @load_vl_v7i16_i32(<7 x i16>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v7i16_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 7 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <7 x i16>, <7 x i16>* [[PTR:%.*]], align 16 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <7 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 0 +; CHECK-NEXT: 
[[TMP10:%.*]] = load i16, i16* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <7 x i16> undef, i16 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <7 x i16> [[TMP11]], i16 [[TMP14]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP17]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <7 x i16> [[TMP15]], i16 [[TMP18]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP5]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP21]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <7 x i16> [[TMP19]], i16 [[TMP22]], i32 [[TMP20]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP24:%.*]] = phi <7 x i16> [ [[TMP23]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ugt i32 [[TMP25]], 0 +; CHECK-NEXT: br i1 [[TMP26]], label [[VP_LOAD_BRANCH2:%.*]], label [[VP_LOAD_LONG3:%.*]] +; CHECK: vp.load.branch2: +; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <7 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i16, i16* [[TMP28]], i32 [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i16, i16* [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <7 x i16> [[TMP24]], i16 [[TMP32]], i32 [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP27]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = 
getelementptr inbounds i16, i16* [[TMP29]], i32 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i16, i16* [[TMP35]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <7 x i16> [[TMP33]], i16 [[TMP36]], i32 [[TMP34]] +; CHECK-NEXT: br label [[VP_LOAD_LONG3]] +; CHECK: vp.load.long3: +; CHECK-NEXT: [[TMP38:%.*]] = phi <7 x i16> [ [[TMP37]], [[VP_LOAD_BRANCH2]] ], [ [[TMP24]], [[VP_LOAD_LONG1]] ] +; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP40:%.*]] = icmp ugt i32 [[TMP39]], 0 +; CHECK-NEXT: br i1 [[TMP40]], label [[VP_LOAD_BRANCH4:%.*]], label [[VP_LOAD_LONG5:%.*]] +; CHECK: vp.load.branch4: +; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[EVL]], 6 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <7 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i16, i16* [[TMP42]], i32 [[TMP41]] +; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP41]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i16, i16* [[TMP43]], i32 0 +; CHECK-NEXT: [[TMP46:%.*]] = load i16, i16* [[TMP45]], align 1 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <7 x i16> [[TMP38]], i16 [[TMP46]], i32 [[TMP44]] +; CHECK-NEXT: br label [[VP_LOAD_LONG5]] +; CHECK: vp.load.long5: +; CHECK-NEXT: [[TMP48:%.*]] = phi <7 x i16> [ [[TMP47]], [[VP_LOAD_BRANCH4]] ], [ [[TMP38]], [[VP_LOAD_LONG3]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP49:%.*]] = phi <7 x i16> [ [[TMP48]], [[VP_LOAD_LONG5]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <7 x i16> [[TMP49]] +; + %res = call <7 x i16> @llvm.vp.load.v7i16(<7 x i16>* %ptr, <7 x i1> undef, i32 %evl) + ret <7 x i16> %res +} +declare <7 x i16> @llvm.vp.load.v7i16(<7 x i16>*, <7 x i1>, i32) + +define void @store_vl_v6i16(<6 x i16>* %ptr, <6 x i16> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v6i16( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 6 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <6 x 
i16> [[VAL:%.*]], <6 x i16>* [[PTR:%.*]], align 16 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <6 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <6 x i16> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <6 x i16> [[VAL]], i32 [[TMP10]] +; CHECK-NEXT: store i16 [[TMP12]], i16* [[TMP11]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <6 x i16> [[VAL]], i32 [[TMP13]] +; CHECK-NEXT: store i16 [[TMP15]], i16* [[TMP14]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP4]], 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <6 x i16> [[VAL]], i32 [[TMP16]] +; CHECK-NEXT: store i16 [[TMP18]], i16* [[TMP17]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = icmp ugt i32 [[TMP19]], 0 +; CHECK-NEXT: br i1 [[TMP20]], label [[VP_STORE_BRANCH2:%.*]], label [[VP_STORE_LONG3:%.*]] +; CHECK: vp.store.branch2: +; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = 
bitcast <6 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, i16* [[TMP22]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <6 x i16> [[VAL]], i32 [[TMP24]] +; CHECK-NEXT: store i16 [[TMP26]], i16* [[TMP25]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP21]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i16, i16* [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <6 x i16> [[VAL]], i32 [[TMP27]] +; CHECK-NEXT: store i16 [[TMP29]], i16* [[TMP28]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG3]] +; CHECK: vp.store.long3: +; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ugt i32 [[TMP30]], 0 +; CHECK-NEXT: br i1 [[TMP31]], label [[VP_STORE_BRANCH4:%.*]], label [[VP_STORE_LONG5:%.*]] +; CHECK: vp.store.branch4: +; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[EVL]], 6 +; CHECK-NEXT: [[TMP33:%.*]] = bitcast <6 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i16, i16* [[TMP33]], i32 [[TMP32]] +; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP32]], 0 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i16, i16* [[TMP34]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <6 x i16> [[VAL]], i32 [[TMP35]] +; CHECK-NEXT: store i16 [[TMP37]], i16* [[TMP36]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG5]] +; CHECK: vp.store.long5: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v6i16(<6 x i16> %val, <6 x i16>* %ptr, <6 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v6i16(<6 x i16>, <6 x i16>*, <6 x i1>, i32) +define <6 x i16> @load_vl_v6i16_i32(<6 x i16>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v6i16_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 6 +; CHECK-NEXT: br i1 [[TMP1]], label 
[[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <6 x i16>, <6 x i16>* [[PTR:%.*]], align 16 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <6 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <6 x i16> undef, i16 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x i16> [[TMP11]], i16 [[TMP14]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP17]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <6 x i16> [[TMP15]], i16 [[TMP18]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP5]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP21]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <6 x i16> [[TMP19]], i16 [[TMP22]], i32 [[TMP20]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP24:%.*]] = phi <6 x i16> [ [[TMP23]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: 
[[TMP25:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ugt i32 [[TMP25]], 0 +; CHECK-NEXT: br i1 [[TMP26]], label [[VP_LOAD_BRANCH2:%.*]], label [[VP_LOAD_LONG3:%.*]] +; CHECK: vp.load.branch2: +; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[EVL]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <6 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i16, i16* [[TMP28]], i32 [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i16, i16* [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <6 x i16> [[TMP24]], i16 [[TMP32]], i32 [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP27]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i16, i16* [[TMP29]], i32 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i16, i16* [[TMP35]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <6 x i16> [[TMP33]], i16 [[TMP36]], i32 [[TMP34]] +; CHECK-NEXT: br label [[VP_LOAD_LONG3]] +; CHECK: vp.load.long3: +; CHECK-NEXT: [[TMP38:%.*]] = phi <6 x i16> [ [[TMP37]], [[VP_LOAD_BRANCH2]] ], [ [[TMP24]], [[VP_LOAD_LONG1]] ] +; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP40:%.*]] = icmp ugt i32 [[TMP39]], 0 +; CHECK-NEXT: br i1 [[TMP40]], label [[VP_LOAD_BRANCH4:%.*]], label [[VP_LOAD_LONG5:%.*]] +; CHECK: vp.load.branch4: +; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[EVL]], 6 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <6 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i16, i16* [[TMP42]], i32 [[TMP41]] +; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP41]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i16, i16* [[TMP43]], i32 0 +; CHECK-NEXT: [[TMP46:%.*]] = load i16, i16* [[TMP45]], align 1 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <6 x i16> [[TMP38]], i16 [[TMP46]], i32 [[TMP44]] +; CHECK-NEXT: br label [[VP_LOAD_LONG5]] +; CHECK: vp.load.long5: +; CHECK-NEXT: 
[[TMP48:%.*]] = phi <6 x i16> [ [[TMP47]], [[VP_LOAD_BRANCH4]] ], [ [[TMP38]], [[VP_LOAD_LONG3]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP49:%.*]] = phi <6 x i16> [ [[TMP48]], [[VP_LOAD_LONG5]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <6 x i16> [[TMP49]] +; + %res = call <6 x i16> @llvm.vp.load.v6i16(<6 x i16>* %ptr, <6 x i1> undef, i32 %evl) + ret <6 x i16> %res +} +declare <6 x i16> @llvm.vp.load.v6i16(<6 x i16>*, <6 x i1>, i32) + +define void @store_vl_v4i16(<4 x i16>* %ptr, <4 x i16> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v4i16( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 4 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <4 x i16> [[VAL:%.*]], <4 x i16>* [[PTR:%.*]], align 8 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[VAL]], i32 [[TMP10]] +; CHECK-NEXT: store i16 [[TMP12]], i16* [[TMP11]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[EVL]], 1 +; 
CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i32 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[VP_STORE_BRANCH2:%.*]], label [[VP_STORE_LONG3:%.*]] +; CHECK: vp.store.branch2: +; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP16]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP15]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, i16* [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i16> [[VAL]], i32 [[TMP18]] +; CHECK-NEXT: store i16 [[TMP20]], i16* [[TMP19]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG3]] +; CHECK: vp.store.long3: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v4i16(<4 x i16> %val, <4 x i16>* %ptr, <4 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v4i16(<4 x i16>, <4 x i16>*, <4 x i1>, i32) +define <4 x i16> @load_vl_v4i16_i32(<4 x i16>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v4i16_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 4 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[PTR:%.*]], align 8 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load 
i16, i16* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i16> undef, i16 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i16> [[TMP11]], i16 [[TMP14]], i32 [[TMP12]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i16> [ [[TMP15]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i32 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[TMP18]], label [[VP_LOAD_BRANCH2:%.*]], label [[VP_LOAD_LONG3:%.*]] +; CHECK: vp.load.branch2: +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[TMP20]], i32 [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP19]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, i16* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[TMP24]], i32 [[TMP22]] +; CHECK-NEXT: br label [[VP_LOAD_LONG3]] +; CHECK: vp.load.long3: +; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i16> [ [[TMP25]], [[VP_LOAD_BRANCH2]] ], [ [[TMP16]], [[VP_LOAD_LONG1]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i16> [ [[TMP26]], [[VP_LOAD_LONG3]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <4 x i16> [[TMP27]] +; + %res = call <4 x i16> @llvm.vp.load.v4i16(<4 x i16>* %ptr, <4 x i1> undef, i32 %evl) + ret <4 x i16> %res +} +declare <4 x i16> @llvm.vp.load.v4i16(<4 x i16>*, <4 x i1>, i32) + +define void @store_vl_v3i16(<3 x i16>* %ptr, <3 x i16> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v3i16( +; 
CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 3 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <3 x i16> [[VAL:%.*]], <3 x i16>* [[PTR:%.*]], align 8 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <3 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <3 x i16> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x i16> [[VAL]], i32 [[TMP10]] +; CHECK-NEXT: store i16 [[TMP12]], i16* [[TMP11]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i32 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[VP_STORE_BRANCH2:%.*]], label [[VP_STORE_LONG3:%.*]] +; CHECK: vp.store.branch2: +; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <3 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP16]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP15]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, i16* [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x i16> [[VAL]], i32 [[TMP18]] +; 
CHECK-NEXT: store i16 [[TMP20]], i16* [[TMP19]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG3]] +; CHECK: vp.store.long3: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v3i16(<3 x i16> %val, <3 x i16>* %ptr, <3 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v3i16(<3 x i16>, <3 x i16>*, <3 x i1>, i32) +define <3 x i16> @load_vl_v3i16_i32(<3 x i16>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v3i16_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 3 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <3 x i16>, <3 x i16>* [[PTR:%.*]], align 8 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <3 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x i16> undef, i16 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x i16> [[TMP11]], i16 [[TMP14]], i32 [[TMP12]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP16:%.*]] = phi <3 x i16> [ [[TMP15]], [[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: 
[[TMP17:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i32 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[TMP18]], label [[VP_LOAD_BRANCH2:%.*]], label [[VP_LOAD_LONG3:%.*]] +; CHECK: vp.load.branch2: +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[EVL]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <3 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[TMP20]], i32 [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP19]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, i16* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <3 x i16> [[TMP16]], i16 [[TMP24]], i32 [[TMP22]] +; CHECK-NEXT: br label [[VP_LOAD_LONG3]] +; CHECK: vp.load.long3: +; CHECK-NEXT: [[TMP26:%.*]] = phi <3 x i16> [ [[TMP25]], [[VP_LOAD_BRANCH2]] ], [ [[TMP16]], [[VP_LOAD_LONG1]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP27:%.*]] = phi <3 x i16> [ [[TMP26]], [[VP_LOAD_LONG3]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <3 x i16> [[TMP27]] +; + %res = call <3 x i16> @llvm.vp.load.v3i16(<3 x i16>* %ptr, <3 x i1> undef, i32 %evl) + ret <3 x i16> %res +} +declare <3 x i16> @llvm.vp.load.v3i16(<3 x i16>*, <3 x i1>, i32) + +define void @store_vl_v2i16(<2 x i16>* %ptr, <2 x i16> %val, i32 %evl) { +; CHECK-LABEL: @store_vl_v2i16( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 2 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_STORE_SHORT:%.*]], label [[VP_STORE_LONG:%.*]] +; CHECK: vp.store.short: +; CHECK-NEXT: store <2 x i16> [[VAL:%.*]], <2 x i16>* [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[VP_STORE_EXIT:%.*]] +; CHECK: vp.store.long: +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[VP_STORE_BRANCH:%.*]], label [[VP_STORE_LONG1:%.*]] +; CHECK: vp.store.branch: +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <2 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[VAL]], i32 [[TMP7]] +; CHECK-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +; CHECK-NEXT: br label [[VP_STORE_LONG1]] +; CHECK: vp.store.long1: +; CHECK-NEXT: br label [[VP_STORE_EXIT]] +; CHECK: vp.store.exit: +; CHECK-NEXT: ret void +; + call void @llvm.vp.store.v2i16(<2 x i16> %val, <2 x i16>* %ptr, <2 x i1> undef, i32 %evl) + ret void +} +declare void @llvm.vp.store.v2i16(<2 x i16>, <2 x i16>*, <2 x i1>, i32) +define <2 x i16> @load_vl_v2i16_i32(<2 x i16>* %ptr, i32 %evl) { +; CHECK-LABEL: @load_vl_v2i16_i32( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[EVL:%.*]], 2 +; CHECK-NEXT: br i1 [[TMP1]], label [[VP_LOAD_SHORT:%.*]], label [[VP_LOAD_LONG:%.*]] +; CHECK: vp.load.short: +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[VP_LOAD_EXIT:%.*]] +; CHECK: vp.load.long: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[EVL]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[VP_LOAD_BRANCH:%.*]], label [[VP_LOAD_LONG1:%.*]] +; CHECK: vp.load.branch: +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[EVL]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i16>* [[PTR]] to i16* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> undef, i16 [[TMP10]], i32 [[TMP8]] +; CHECK-NEXT: br label [[VP_LOAD_LONG1]] +; CHECK: vp.load.long1: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP11]], 
[[VP_LOAD_BRANCH]] ], [ undef, [[VP_LOAD_LONG]] ] +; CHECK-NEXT: br label [[VP_LOAD_EXIT]] +; CHECK: vp.load.exit: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP12]], [[VP_LOAD_LONG1]] ], [ [[TMP2]], [[VP_LOAD_SHORT]] ] +; CHECK-NEXT: ret <2 x i16> [[TMP13]] +; + %res = call <2 x i16> @llvm.vp.load.v2i16(<2 x i16>* %ptr, <2 x i1> undef, i32 %evl) + ret <2 x i16> %res +} +declare <2 x i16> @llvm.vp.load.v2i16(<2 x i16>*, <2 x i1>, i32) +