Index: llvm/lib/Target/ARM/ARMISelLowering.h =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.h +++ llvm/lib/Target/ARM/ARMISelLowering.h @@ -604,7 +604,7 @@ /// Returns true if \p VecTy is a legal interleaved access type. This /// function checks the vector element type and the overall width of the /// vector. - bool isLegalInterleavedAccessType(VectorType *VecTy, + bool isLegalInterleavedAccessType(unsigned Factor, VectorType *VecTy, const DataLayout &DL) const; bool alignLoopsWithOptSize() const override; Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -16647,16 +16647,21 @@ return (DL.getTypeSizeInBits(VecTy) + 127) / 128; } -bool ARMTargetLowering::isLegalInterleavedAccessType( +bool ARMTargetLowering::isLegalInterleavedAccessType(unsigned Factor, VectorType *VecTy, const DataLayout &DL) const { unsigned VecSize = DL.getTypeSizeInBits(VecTy); unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); + if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps()) + return false; + // Ensure the vector doesn't have f16 elements. Even though we could do an // i16 vldN, we can't hold the f16 vectors and will end up converting via // f32. - if (VecTy->getElementType()->isHalfTy()) + if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy()) + return false; + if (Subtarget->hasMVEIntegerOps() && Factor == 3) return false; // Ensure the number of vector elements is greater than 1. @@ -16669,12 +16674,16 @@ // Ensure the total vector size is 64 or a multiple of 128. Types larger than // 128 will be split into multiple interleaved accesses. - return VecSize == 64 || VecSize % 128 == 0; + if (Subtarget->hasNEON() && VecSize == 64) + return true; + return VecSize % 128 == 0; } unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { if (Subtarget->hasNEON()) return 4; + if (Subtarget->hasMVEIntegerOps()) + return 4; return TargetLoweringBase::getMaxSupportedInterleaveFactor(); } @@ -16706,7 +16715,7 @@ // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) + if (!isLegalInterleavedAccessType(Factor, VecTy, DL)) return false; unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); @@ -16738,13 +16747,34 @@ assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); - Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); - Type *Tys[] = {VecTy, Int8Ptr}; - static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, - Intrinsic::arm_neon_vld3, - Intrinsic::arm_neon_vld4}; - Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); + auto createLoadIntrinsic = [&](Value *BaseAddr) { + if (Subtarget->hasNEON()) { + Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); + Type *Tys[] = {VecTy, Int8Ptr}; + static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, + Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; + Function *VldnFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); + + SmallVector Ops; + Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); + Ops.push_back(Builder.getInt32(LI->getAlignment())); + + return Builder.CreateCall(VldnFunc, Ops, "vldN"); + } + else { + assert((Factor == 2 || Factor == 4) && "expected interleave factor of 2 or 4 for MVE"); + Intrinsic::ID LoadInts = Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q; + Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo(LI->getPointerAddressSpace()); + Type *Tys[] = {VecTy, VecEltTy}; + Function *VldnFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); + + SmallVector Ops; + Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy)); + dbgs() << *BaseAddr << "\n"; + return Builder.CreateCall(VldnFunc, Ops, "vldN"); + } + }; // Holds sub-vectors extracted from the load intrinsic return values. The // sub-vectors are associated with the shufflevector instructions they will @@ -16759,11 +16789,7 @@ Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, VecTy->getVectorNumElements() * Factor); - SmallVector Ops; - Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); - Ops.push_back(Builder.getInt32(LI->getAlignment())); - - CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); + CallInst *VldN = createLoadIntrinsic(BaseAddr); // Replace uses of each shufflevector with the corresponding vector loaded // by ldN. @@ -16842,7 +16868,7 @@ // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) + if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL)) return false; unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); @@ -16886,11 +16912,46 @@ auto Mask = SVI->getShuffleMask(); - Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); - Type *Tys[] = {Int8Ptr, SubVecTy}; - static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, - Intrinsic::arm_neon_vst3, - Intrinsic::arm_neon_vst4}; + auto createStoreIntrinsic = [&](Value* BaseAddr, SmallVectorImpl &Shuffles) { + if (Subtarget->hasNEON()) { + static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, + Intrinsic::arm_neon_vst3, Intrinsic::arm_neon_vst4}; + Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); + Type *Tys[] = {Int8Ptr, SubVecTy}; + + Function *VstNFunc = + Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); + + SmallVector Ops; + Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); + for (auto S : Shuffles) + Ops.push_back(S); + Ops.push_back(Builder.getInt32(SI->getAlignment())); + Builder.CreateCall(VstNFunc, Ops); + } + else { + assert((Factor == 2 || Factor == 4) && "expected interleave factor of 2 or 4 for MVE"); + Intrinsic::ID StoreInts = Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q; + Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo(SI->getPointerAddressSpace()); + Type *Tys[] = {EltPtrTy, SubVecTy}; + Function *VstNFunc = Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys); + + SmallVector Ops; + Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy)); + for (auto S : Shuffles) + Ops.push_back(S); + Ops.push_back(Builder.getInt32(0)); + Builder.CreateCall(VstNFunc, Ops); + Ops.back() = Builder.getInt32(1); + Builder.CreateCall(VstNFunc, Ops); + if (Factor == 4) { + Ops.back() = Builder.getInt32(2); + Builder.CreateCall(VstNFunc, Ops); + Ops.back() = Builder.getInt32(3); + Builder.CreateCall(VstNFunc, Ops); + } + } + }; for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { // If we generating more than one store, we compute the base address of @@ -16899,17 +16960,13 @@ BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), BaseAddr, LaneLen * Factor); - SmallVector Ops; - Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); - - Function *VstNFunc = - Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); + SmallVector Shuffles; // Split the shufflevector operands into sub vectors for the new vstN call. for (unsigned i = 0; i < Factor; i++) { unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { - Ops.push_back(Builder.CreateShuffleVector( + Shuffles.push_back(Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); } else { unsigned StartMask = 0; @@ -16926,13 +16983,12 @@ // In the case of all undefs we're defaulting to using elems from 0 // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask - Ops.push_back(Builder.CreateShuffleVector( + Shuffles.push_back(Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } } - Ops.push_back(Builder.getInt32(SI->getAlignment())); - Builder.CreateCall(VstNFunc, Ops); + createStoreIntrinsic(BaseAddr, Shuffles); } return true; } Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -773,7 +773,7 @@ // Accesses having vector types that are a multiple of 128 bits can be // matched to more than one vldN/vstN instruction. if (NumElts % Factor == 0 && - TLI->isLegalInterleavedAccessType(SubVecTy, DL)) + TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL)) return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL); } Index: llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll =================================================================== --- llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll +++ llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll @@ -73,11 +73,12 @@ ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_factor4( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4 -; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> -; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> -; CHECK-MVE-NEXT: [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> -; CHECK-MVE-NEXT: [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP1]]) +; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3 +; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2 +; CHECK-MVE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1 +; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0 ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_factor4( @@ -98,10 +99,10 @@ define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) { ; CHECK-NEON-LABEL: @store_factor2( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <16 x i8>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* [[TMP3]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_factor2( @@ -123,11 +124,11 @@ ; CHECK-NEON-LABEL: @store_factor3( ; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_factor3( @@ -155,19 +156,26 @@ ; CHECK-NEON-LABEL: @store_factor4( ; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_factor4( ; CHECK-MVE-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; CHECK-MVE-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> -; CHECK-MVE-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 1) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 2) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 3) ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_factor4( @@ -282,10 +290,10 @@ ; CHECK-NEON-LABEL: @store_ptrvec_factor2( ; CHECK-NEON-NEXT: [[TMP1:%.*]] = ptrtoint <2 x i32*> [[V0:%.*]] to <2 x i32> ; CHECK-NEON-NEXT: [[TMP2:%.*]] = ptrtoint <2 x i32*> [[V1:%.*]] to <2 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <4 x i32*>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <4 x i32*>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_ptrvec_factor2( @@ -309,11 +317,11 @@ ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> undef, <4 x i32> ; CHECK-NEON-NEXT: [[TMP1:%.*]] = ptrtoint <4 x i32*> [[S0]] to <4 x i32> ; CHECK-NEON-NEXT: [[TMP2:%.*]] = ptrtoint <4 x i32*> [[S1]] to <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <6 x i32*>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], i32 4) +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP6:%.*]] = bitcast <6 x i32*>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP6]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_ptrvec_factor3( @@ -343,12 +351,12 @@ ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> [[V3:%.*]], <4 x i32> ; CHECK-NEON-NEXT: [[TMP1:%.*]] = ptrtoint <4 x i32*> [[S0]] to <4 x i32> ; CHECK-NEON-NEXT: [[TMP2:%.*]] = ptrtoint <4 x i32*> [[S1]] to <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <8 x i32*>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i32 4) +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP7:%.*]] = bitcast <8 x i32*>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP7]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_ptrvec_factor4( @@ -381,9 +389,10 @@ ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_undef_mask_factor2( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[PTR:%.*]], align 4 -; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> -; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]]) +; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1 +; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0 ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_undef_mask_factor2( @@ -439,11 +448,12 @@ ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_undef_mask_factor4( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4 -; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> -; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> -; CHECK-MVE-NEXT: [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> -; CHECK-MVE-NEXT: [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP1]]) +; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3 +; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2 +; CHECK-MVE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1 +; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0 ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_undef_mask_factor4( @@ -464,15 +474,18 @@ define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1) { ; CHECK-NEON-LABEL: @store_undef_mask_factor2( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_undef_mask_factor2( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; CHECK-MVE-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP3:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP3]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP3]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 1) ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_undef_mask_factor2( @@ -489,11 +502,11 @@ ; CHECK-NEON-LABEL: @store_undef_mask_factor3( ; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_undef_mask_factor3( @@ -521,19 +534,26 @@ ; CHECK-NEON-LABEL: @store_undef_mask_factor4( ; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_undef_mask_factor4( ; CHECK-MVE-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; CHECK-MVE-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> -; CHECK-MVE-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 1) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 2) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 3) ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_undef_mask_factor4( @@ -582,10 +602,10 @@ define void @store_address_space(<4 x i32> addrspace(1)* %ptr, <2 x i32> %v0, <2 x i32> %v1) { ; CHECK-NEON-LABEL: @store_address_space( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* [[PTR:%.*]] to i8 addrspace(1)* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 0) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> addrspace(1)* [[PTR:%.*]] to i8 addrspace(1)* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* [[TMP3]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 0) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_address_space( @@ -693,12 +713,12 @@ define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor4( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor4( @@ -718,12 +738,12 @@ define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor4_undefbeg( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor4_undefbeg( @@ -743,12 +763,12 @@ define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor4_undefend( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor4_undefend( @@ -768,12 +788,12 @@ define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor4_undefmid( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor4_undefmid( @@ -793,12 +813,12 @@ define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor4_undefmulti( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> ; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor4_undefmulti( @@ -818,11 +838,11 @@ define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor3( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor3( @@ -842,11 +862,11 @@ define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor3_undefmultimid( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor3_undefmultimid( @@ -887,11 +907,11 @@ define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor3_undeflane( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor3_undeflane( @@ -932,11 +952,11 @@ define void @store_general_mask_factor3_endstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor3_endstart_pass( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor3_endstart_pass( @@ -977,11 +997,11 @@ define void @store_general_mask_factor3_midstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor3_midstart_pass( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor3_midstart_pass( @@ -1040,9 +1060,16 @@ ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_factor2_wide2( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4 -; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> -; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]]) +; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1 +; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0 +; CHECK-MVE-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; CHECK-MVE-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP4]]) +; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; CHECK-MVE-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; CHECK-MVE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP5]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_factor2_wide2( @@ -1083,9 +1110,24 @@ ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_factor2_wide3( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4 -; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> -; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]]) +; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1 +; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0 +; CHECK-MVE-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; CHECK-MVE-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP4]]) +; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; CHECK-MVE-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; CHECK-MVE-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP4]], i32 8 +; CHECK-MVE-NEXT: [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP7]]) +; CHECK-MVE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1 +; CHECK-MVE-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0 +; CHECK-MVE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP5]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <8 x i32> +; CHECK-MVE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <12 x i32> +; CHECK-MVE-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> +; CHECK-MVE-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_factor2_wide3( @@ -1163,11 +1205,22 @@ ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_factor4_wide( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <32 x i32>, <32 x i32>* [[PTR:%.*]], align 4 -; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> -; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> -; CHECK-MVE-NEXT: [[V2:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> -; CHECK-MVE-NEXT: [[V3:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP1]]) +; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3 +; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2 +; CHECK-MVE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1 +; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0 +; CHECK-MVE-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; CHECK-MVE-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP6]]) +; CHECK-MVE-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3 +; CHECK-MVE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2 +; CHECK-MVE-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; CHECK-MVE-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; CHECK-MVE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP7]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_factor4_wide( @@ -1189,20 +1242,28 @@ define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) { ; CHECK-NEON-LABEL: @store_factor2_wide( ; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP1]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 -; CHECK-NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8* -; CHECK-NEON-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4) +; CHECK-NEON-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP5]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_factor2_wide( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> -; CHECK-MVE-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 1) +; CHECK-MVE-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; CHECK-MVE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 1) ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_factor2_wide( @@ -1220,17 +1281,17 @@ ; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> undef, <16 x i32> ; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP1]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP5]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12 -; CHECK-NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8* -; CHECK-NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4) +; CHECK-NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP6]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP10]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_factor3_wide( @@ -1259,26 +1320,42 @@ ; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> ; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4) +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP1]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4) ; CHECK-NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 -; CHECK-NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -; CHECK-NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4) +; CHECK-NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP7]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP12]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_factor4_wide( ; CHECK-MVE-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> ; CHECK-MVE-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <32 x i32> -; CHECK-MVE-NEXT: store <32 x i32> [[INTERLEAVED_VEC]], <32 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 1) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 2) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3) +; CHECK-MVE-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; CHECK-MVE-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 1) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 2) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3) ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_factor4_wide( @@ -1341,9 +1418,20 @@ ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_factor2_wide_pointer( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x i32*>, <16 x i32*>* [[PTR:%.*]], align 4 -; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> -; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x i32*>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]]) +; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1 +; CHECK-MVE-NEXT: [[TMP3:%.*]] = inttoptr <4 x i32> [[TMP2]] to <4 x i32*> +; CHECK-MVE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0 +; CHECK-MVE-NEXT: [[TMP5:%.*]] = inttoptr <4 x i32> [[TMP4]] to <4 x i32*> +; CHECK-MVE-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; CHECK-MVE-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP6]]) +; CHECK-MVE-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; CHECK-MVE-NEXT: [[TMP8:%.*]] = inttoptr <4 x i32> [[TMP7]] to <4 x i32*> +; CHECK-MVE-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; CHECK-MVE-NEXT: [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*> +; CHECK-MVE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> [[TMP8]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32*> [[TMP5]], <4 x i32*> [[TMP10]], <8 x i32> ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_factor2_wide_pointer( Index: llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll @@ -13,10 +13,10 @@ br label %for.body ; VF_8-LABEL: Checking a loop in "i8_factor_2" -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_8: Found an estimated cost of 272 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1 ; VF_16-LABEL: Checking a loop in "i8_factor_2" ; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 @@ -44,10 +44,10 @@ br label %for.body ; VF_4-LABEL: Checking a loop in "i16_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 40 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_8-LABEL: Checking a loop in "i16_factor_2" ; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 @@ -80,10 +80,10 @@ br label %for.body ; VF_2-LABEL: Checking a loop in "i32_factor_2" -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_2: Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_4-LABEL: Checking a loop in "i32_factor_2" ; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 @@ -266,14 +266,14 @@ ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp2, align 1 ; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i8 0, i8* %tmp3, align 1 ; VF_16-LABEL: Checking a loop in "i8_factor_4" -; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1 +; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, i8* %tmp2, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, i8* %tmp3, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1 -; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1 +; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.4, %i8.4* %data, i64 %i, i32 0 @@ -311,23 +311,23 @@ ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp2, align 2 ; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i16 0, i16* %tmp3, align 2 ; VF_8-LABEL: Checking a loop in "i16_factor_4" -; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, i16* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, i16* %tmp2, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, i16* %tmp3, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2 ; VF_16-LABEL: Checking a loop in "i16_factor_4" -; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 +; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, i16* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, i16* %tmp2, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, i16* %tmp3, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2 -; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2 +; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.4, %i16.4* %data, i64 %i, i32 0 @@ -365,32 +365,32 @@ ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp2, align 4 ; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i32 0, i32* %tmp3, align 4 ; VF_4-LABEL: Checking a loop in "i32_factor_4" -; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 +; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, i32* %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, i32* %tmp2, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i32, i32* %tmp3, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp2, align 4 -; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4 +; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4 ; VF_8-LABEL: Checking a loop in "i32_factor_4" -; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 +; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, i32* %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, i32* %tmp2, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, i32* %tmp3, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4 +; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4 ; VF_16-LABEL: Checking a loop in "i32_factor_4" -; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 +; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, i32* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, i32* %tmp2, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, i32* %tmp3, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4 +; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.4, %i32.4* %data, i64 %i, i32 0