diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -604,7 +604,7 @@
     /// Returns true if \p VecTy is a legal interleaved access type. This
     /// function checks the vector element type and the overall width of the
     /// vector.
-    bool isLegalInterleavedAccessType(VectorType *VecTy,
+    bool isLegalInterleavedAccessType(unsigned Factor, VectorType *VecTy,
                                       const DataLayout &DL) const;

     bool alignLoopsWithOptSize() const override;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -16681,15 +16681,20 @@
 }

 bool ARMTargetLowering::isLegalInterleavedAccessType(
-    VectorType *VecTy, const DataLayout &DL) const {
+    unsigned Factor, VectorType *VecTy, const DataLayout &DL) const {

   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

+  if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
+    return false;
+
   // Ensure the vector doesn't have f16 elements. Even though we could do an
   // i16 vldN, we can't hold the f16 vectors and will end up converting via
   // f32.
-  if (VecTy->getElementType()->isHalfTy())
+  if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
+    return false;
+  if (Subtarget->hasMVEIntegerOps() && Factor == 3)
     return false;

   // Ensure the number of vector elements is greater than 1.
@@ -16702,12 +16707,16 @@

   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
   // 128 will be split into multiple interleaved accesses.
-  return VecSize == 64 || VecSize % 128 == 0;
+  if (Subtarget->hasNEON() && VecSize == 64)
+    return true;
+  return VecSize % 128 == 0;
 }

 unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
   if (Subtarget->hasNEON())
     return 4;
+  if (Subtarget->hasMVEIntegerOps())
+    return 4;
   return TargetLoweringBase::getMaxSupportedInterleaveFactor();
 }

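For orientation: with Factor now threaded through, isLegalInterleavedAccessType accepts factors 2 and 4 on MVE (128-bit vectors only, no factor 3), and lowerInterleavedLoad below can then emit the new arm_mve_vld2q/vld4q intrinsics in place of the NEON vldN calls. A minimal sketch of the intended IR rewrite for a factor-2 interleaved load of <8 x i32> (value names assumed for illustration; not taken from this patch's tests):

    ; Before: one wide load feeding two deinterleaving shuffles.
    %wide = load <8 x i32>, <8 x i32>* %src, align 4
    %even = shufflevector <8 x i32> %wide, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
    %odd = shufflevector <8 x i32> %wide, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

    ; After: one struct-returning intrinsic call, which selects to the
    ; vld20.32/vld21.32 pair visible in the mve-vld2.ll diffs below.
    %base = bitcast <8 x i32>* %src to i32*
    %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* %base)
    %even.new = extractvalue { <4 x i32>, <4 x i32> } %vldN, 0
    %odd.new = extractvalue { <4 x i32>, <4 x i32> } %vldN, 1
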
@@ -16739,7 +16748,7 @@
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
-  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
+  if (!isLegalInterleavedAccessType(Factor, VecTy, DL))
     return false;

   unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
@@ -16771,13 +16780,37 @@
   assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

-  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
-  Type *Tys[] = {VecTy, Int8Ptr};
-  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
-                                            Intrinsic::arm_neon_vld3,
-                                            Intrinsic::arm_neon_vld4};
-  Function *VldnFunc =
-      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+  auto createLoadIntrinsic = [&](Value *BaseAddr) {
+    if (Subtarget->hasNEON()) {
+      Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+      Type *Tys[] = {VecTy, Int8Ptr};
+      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
+                                                Intrinsic::arm_neon_vld3,
+                                                Intrinsic::arm_neon_vld4};
+      Function *VldnFunc =
+          Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+
+      SmallVector<Value *, 2> Ops;
+      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+      Ops.push_back(Builder.getInt32(LI->getAlignment()));
+
+      return Builder.CreateCall(VldnFunc, Ops, "vldN");
+    } else {
+      assert((Factor == 2 || Factor == 4) &&
+             "expected interleave factor of 2 or 4 for MVE");
+      Intrinsic::ID LoadInts =
+          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
+      Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo(
+          LI->getPointerAddressSpace());
+      Type *Tys[] = {VecTy, VecEltTy};
+      Function *VldnFunc =
+          Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);
+
+      SmallVector<Value *, 2> Ops;
+      Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
+      return Builder.CreateCall(VldnFunc, Ops, "vldN");
+    }
+  };

   // Holds sub-vectors extracted from the load intrinsic return values. The
   // sub-vectors are associated with the shufflevector instructions they will
@@ -16792,11 +16825,7 @@
         Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
                                    VecTy->getVectorNumElements() * Factor);

-    SmallVector<Value *, 2> Ops;
-    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
-    Ops.push_back(Builder.getInt32(LI->getAlignment()));
-
-    CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+    CallInst *VldN = createLoadIntrinsic(BaseAddr);

     // Replace uses of each shufflevector with the corresponding vector loaded
     // by ldN.
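The store path, changed in the next hunks, mirrors this, with one difference worth spelling out: NEON's arm_neon_vstN intrinsics take all the sub-vectors plus an alignment operand in a single call, whereas the MVE arm_mve_vst2q/vst4q intrinsics take the base pointer, the sub-vectors, and a stage index, so createStoreIntrinsic loops over the stages and emits Factor calls. A sketch of the factor-2 output (assumed names and overload mangling, shown here for illustration only):

    call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* %base, <4 x i32> %part0, <4 x i32> %part1, i32 0)
    call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* %base, <4 x i32> %part0, <4 x i32> %part1, i32 1)

Each call writes the lanes handled by one beat of the interleaving store; together they select to the vst20.32/vst21.32 instruction pair.
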
@@ -16875,7 +16904,7 @@
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
-  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
+  if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL))
     return false;

   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
@@ -16919,11 +16948,46 @@
   auto Mask = SVI->getShuffleMask();

-  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
-  Type *Tys[] = {Int8Ptr, SubVecTy};
-  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
-                                             Intrinsic::arm_neon_vst3,
-                                             Intrinsic::arm_neon_vst4};
+  auto createStoreIntrinsic = [&](Value *BaseAddr,
+                                  SmallVectorImpl<Value *> &Shuffles) {
+    if (Subtarget->hasNEON()) {
+      static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
+                                                 Intrinsic::arm_neon_vst3,
+                                                 Intrinsic::arm_neon_vst4};
+      Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+      Type *Tys[] = {Int8Ptr, SubVecTy};
+
+      Function *VstNFunc = Intrinsic::getDeclaration(
+          SI->getModule(), StoreInts[Factor - 2], Tys);
+
+      SmallVector<Value *, 6> Ops;
+      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+      for (auto S : Shuffles)
+        Ops.push_back(S);
+      Ops.push_back(Builder.getInt32(SI->getAlignment()));
+      Builder.CreateCall(VstNFunc, Ops);
+    } else {
+      assert((Factor == 2 || Factor == 4) &&
+             "expected interleave factor of 2 or 4 for MVE");
+      Intrinsic::ID StoreInts =
+          Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
+      Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo(
+          SI->getPointerAddressSpace());
+      Type *Tys[] = {EltPtrTy, SubVecTy};
+      Function *VstNFunc =
+          Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);
+
+      SmallVector<Value *, 6> Ops;
+      Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
+      for (auto S : Shuffles)
+        Ops.push_back(S);
+      for (unsigned F = 0; F < Factor; F++) {
+        Ops.push_back(Builder.getInt32(F));
+        Builder.CreateCall(VstNFunc, Ops);
+        Ops.pop_back();
+      }
+    }
+  };

   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
     // If we're generating more than one store, we compute the base address of
@@ -16932,17 +16996,13 @@
       BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
                                             BaseAddr, LaneLen * Factor);

-    SmallVector<Value *, 6> Ops;
-    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
-
-    Function *VstNFunc =
-        Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+    SmallVector<Value *, 4> Shuffles;

     // Split the shufflevector operands into sub vectors for the new vstN call.
     for (unsigned i = 0; i < Factor; i++) {
       unsigned IdxI = StoreCount * LaneLen * Factor + i;
       if (Mask[IdxI] >= 0) {
-        Ops.push_back(Builder.CreateShuffleVector(
+        Shuffles.push_back(Builder.CreateShuffleVector(
             Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
       } else {
         unsigned StartMask = 0;
@@ -16959,13 +17019,12 @@
         // In the case of all undefs we're defaulting to using elems from 0
         // Note: StartMask cannot be negative, it's checked in
         // isReInterleaveMask
-        Ops.push_back(Builder.CreateShuffleVector(
+        Shuffles.push_back(Builder.CreateShuffleVector(
             Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
       }
     }

-    Ops.push_back(Builder.getInt32(SI->getAlignment()));
-    Builder.CreateCall(VstNFunc, Ops);
+    createStoreIntrinsic(BaseAddr, Shuffles);
   }
   return true;
 }
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -755,13 +755,10 @@
   return BaseCost * LT.first;
 }

-int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
-                                           unsigned Factor,
-                                           ArrayRef<unsigned> Indices,
-                                           unsigned Alignment,
-                                           unsigned AddressSpace,
-                                           bool UseMaskForCond,
-                                           bool UseMaskForGaps) {
+int ARMTTIImpl::getInterleavedMemoryOpCost(
+    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+    unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+    bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");

@@ -776,9 +773,19 @@
     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
     // Accesses having vector types that are a multiple of 128 bits can be
     // matched to more than one vldN/vstN instruction.
+    int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
     if (NumElts % Factor == 0 &&
-        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
-      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
+      return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+
+    // Some smaller-than-legal interleaved patterns are cheap, as we can make
+    // use of the vmovn or vrev patterns to interleave a standard load. This is
+    // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
+    // promoted differently). The cost of 2 here is then a load and vrev or
+    // vmovn.
+ if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 && + VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64) + return 2 * BaseCost; } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -28,16 +28,9 @@ define void @vld2_v4i32(<8 x i32> *%src, <4 x i32> *%dst) { ; CHECK-LABEL: vld2_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s7, s2 -; CHECK-NEXT: vadd.i32 q0, q1, q2 +; CHECK-NEXT: vld20.32 {q0, q1}, [r0] +; CHECK-NEXT: vld21.32 {q0, q1}, [r0] +; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -52,28 +45,15 @@ define void @vld2_v8i32(<16 x i32> *%src, <8 x i32> *%dst) { ; CHECK-LABEL: vld2_v8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s15, s7 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vadd.i32 q0, q0, q3 -; CHECK-NEXT: vmov.f32 s12, s9 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov.f32 s15, s7 -; CHECK-NEXT: vmov.f32 s11, s6 +; CHECK-NEXT: add.w r2, r0, #32 +; CHECK-NEXT: vld20.32 {q0, q1}, [r0] +; CHECK-NEXT: vld20.32 {q2, q3}, [r2] +; CHECK-NEXT: vld21.32 {q0, q1}, [r0] +; CHECK-NEXT: vld21.32 {q2, q3}, [r2] +; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vadd.i32 q1, q2, q3 -; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: bx lr entry: %l1 = load <16 x i32>, <16 x i32>* %src, align 4 @@ -87,53 +67,29 @@ define void @vld2_v16i32(<32 x i32> *%src, <16 x i32> *%dst) { ; CHECK-LABEL: vld2_v16i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s7, s2 -; CHECK-NEXT: vadd.i32 q0, q1, q2 -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f32 s12, s9 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov.f32 s15, s7 -; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vadd.i32 q1, q2, q3 -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.f32 s20, s9 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s23, s15 -; CHECK-NEXT: vmov.f32 s11, s14 -; CHECK-NEXT: vldrw.u32 q3, [r0, 
#112] -; CHECK-NEXT: vadd.i32 q2, q2, q5 -; CHECK-NEXT: vmov.f32 s20, s17 -; CHECK-NEXT: vmov.f32 s21, s19 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vmov.f32 s17, s18 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vld20.32 {q0, q1}, [r0] +; CHECK-NEXT: add.w r12, r0, #96 +; CHECK-NEXT: add.w r3, r0, #32 +; CHECK-NEXT: add.w r2, r0, #64 +; CHECK-NEXT: vld21.32 {q0, q1}, [r0] +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vld20.32 {q1, q2}, [r2] +; CHECK-NEXT: vld20.32 {q3, q4}, [r12] +; CHECK-NEXT: vld20.32 {q5, q6}, [r3] +; CHECK-NEXT: vld21.32 {q5, q6}, [r3] +; CHECK-NEXT: vld21.32 {q1, q2}, [r2] +; CHECK-NEXT: vld21.32 {q3, q4}, [r12] +; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmov.f32 s18, s12 -; CHECK-NEXT: vmov.f32 s23, s15 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vadd.i32 q3, q4, q5 +; CHECK-NEXT: vadd.i32 q5, q5, q6 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vstrw.32 q3, [r1, #48] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vstrw.32 q5, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i32>, <32 x i32>* %src, align 4 @@ -189,41 +145,9 @@ define void @vld2_v8i16(<16 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld2_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vadd.i16 q0, q3, q0 +; CHECK-NEXT: vld20.16 {q0, q1}, [r0] +; CHECK-NEXT: vld21.16 {q0, q1}, [r0] +; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -238,81 +162,15 @@ define void @vld2_v16i16(<32 x i16> *%src, <16 x i16> *%dst) { ; CHECK-LABEL: vld2_v16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: 
vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q3[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vadd.i16 q0, q3, q0 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.u16 r2, q3[0] -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.u16 r2, q3[6] -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vadd.i16 q1, q1, q4 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add.w r2, r0, #32 +; CHECK-NEXT: vld20.16 {q0, q1}, [r0] +; CHECK-NEXT: vld20.16 {q2, q3}, [r2] +; CHECK-NEXT: vld21.16 {q0, q1}, [r0] +; CHECK-NEXT: vld21.16 {q2, q3}, [r2] +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vadd.i16 q1, q2, q3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i16>, <32 x i16>* %src, align 4 @@ -385,73 +243,9 @@ define void @vld2_v16i8(<32 x i8> *%src, <16 x i8> *%dst) { ; CHECK-LABEL: vld2_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.u8 r2, q1[1] -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.8 q0[0], r2 -; CHECK-NEXT: vmov.u8 r2, q1[3] -; CHECK-NEXT: vmov.8 q0[1], r2 -; CHECK-NEXT: vmov.u8 r2, q1[5] -; CHECK-NEXT: vmov.8 q0[2], r2 -; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.8 q0[3], r2 -; CHECK-NEXT: vmov.u8 r2, q1[9] -; CHECK-NEXT: vmov.8 q0[4], r2 -; CHECK-NEXT: vmov.u8 r2, q1[11] -; CHECK-NEXT: vmov.8 q0[5], r2 -; CHECK-NEXT: vmov.u8 r2, q1[13] -; CHECK-NEXT: vmov.8 q0[6], r2 -; CHECK-NEXT: vmov.u8 r2, q1[15] -; CHECK-NEXT: vmov.8 q0[7], r2 -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q2[9] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q2[11] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: 
vmov.u8 r0, q2[13] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] -; CHECK-NEXT: vmov.8 q0[15], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov.8 q3[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.8 q3[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q3[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.8 q3[11], r0 -; CHECK-NEXT: vmov.u8 r0, q2[8] -; CHECK-NEXT: vmov.8 q3[12], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.8 q3[13], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] -; CHECK-NEXT: vmov.8 q3[14], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] -; CHECK-NEXT: vmov.8 q3[15], r0 -; CHECK-NEXT: vadd.i8 q0, q3, q0 +; CHECK-NEXT: vld20.8 {q0, q1}, [r0] +; CHECK-NEXT: vld21.8 {q0, q1}, [r0] +; CHECK-NEXT: vadd.i8 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -597,16 +391,9 @@ define void @vld2_v4f32(<8 x float> *%src, <4 x float> *%dst) { ; CHECK-LABEL: vld2_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s7, s2 -; CHECK-NEXT: vadd.f32 q0, q1, q2 +; CHECK-NEXT: vld20.32 {q0, q1}, [r0] +; CHECK-NEXT: vld21.32 {q0, q1}, [r0] +; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -621,28 +408,15 @@ define void @vld2_v8f32(<16 x float> *%src, <8 x float> *%dst) { ; CHECK-LABEL: vld2_v8f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s15, s7 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vadd.f32 q0, q0, q3 -; CHECK-NEXT: vmov.f32 s12, s9 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov.f32 s15, s7 -; CHECK-NEXT: vmov.f32 s11, s6 +; CHECK-NEXT: add.w r2, r0, #32 +; CHECK-NEXT: vld20.32 {q0, q1}, [r0] +; CHECK-NEXT: vld20.32 {q2, q3}, [r2] +; CHECK-NEXT: vld21.32 {q0, q1}, [r0] +; CHECK-NEXT: vld21.32 {q2, q3}, [r2] +; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vadd.f32 q1, q2, q3 -; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: bx lr entry: %l1 = load <16 x float>, <16 x float>* %src, align 4 @@ -656,53 +430,29 @@ define void @vld2_v16f32(<32 x float> *%src, <16 x float> *%dst) { ; CHECK-LABEL: vld2_v16f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q0, 
[r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s7, s2 -; CHECK-NEXT: vadd.f32 q0, q1, q2 -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f32 s12, s9 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov.f32 s15, s7 -; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vadd.f32 q1, q2, q3 -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.f32 s20, s9 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s23, s15 -; CHECK-NEXT: vmov.f32 s11, s14 -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vadd.f32 q2, q2, q5 -; CHECK-NEXT: vmov.f32 s20, s17 -; CHECK-NEXT: vmov.f32 s21, s19 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vmov.f32 s17, s18 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vld20.32 {q0, q1}, [r0] +; CHECK-NEXT: add.w r12, r0, #96 +; CHECK-NEXT: add.w r3, r0, #32 +; CHECK-NEXT: add.w r2, r0, #64 +; CHECK-NEXT: vld21.32 {q0, q1}, [r0] +; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vld20.32 {q1, q2}, [r2] +; CHECK-NEXT: vld20.32 {q3, q4}, [r12] +; CHECK-NEXT: vld20.32 {q5, q6}, [r3] +; CHECK-NEXT: vld21.32 {q5, q6}, [r3] +; CHECK-NEXT: vld21.32 {q1, q2}, [r2] +; CHECK-NEXT: vld21.32 {q3, q4}, [r12] +; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmov.f32 s18, s12 -; CHECK-NEXT: vmov.f32 s23, s15 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vadd.f32 q3, q4, q5 +; CHECK-NEXT: vadd.f32 q5, q5, q6 +; CHECK-NEXT: vadd.f32 q1, q1, q2 +; CHECK-NEXT: vadd.f32 q3, q3, q4 +; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vstrw.32 q3, [r1, #48] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vstrw.32 q5, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x float>, <32 x float>* %src, align 4 @@ -785,53 +535,10 @@ define void @vld2_v8f16(<16 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld2_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmovx.f16 s16, s10 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s5 -; 
CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s7 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vadd.f16 q0, q0, q3 +; CHECK-NEXT: vld20.16 {q0, q1}, [r0] +; CHECK-NEXT: vld21.16 {q0, q1}, [r0] +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x half>, <16 x half>* %src, align 4 @@ -845,97 +552,15 @@ define void @vld2_v16f16(<32 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld2_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmovx.f16 s16, s10 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.16 q3[1], r3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vmov.16 q3[5], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s7 -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vadd.f16 q1, q0, q3 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s9 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmovx.f16 s12, s10 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s11 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s1 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s2 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s3 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vadd.f16 q0, q3, q1 +; CHECK-NEXT: add.w r2, 
r0, #32 +; CHECK-NEXT: vld20.16 {q0, q1}, [r2] +; CHECK-NEXT: vld21.16 {q0, q1}, [r2] +; CHECK-NEXT: vadd.f16 q0, q0, q1 +; CHECK-NEXT: vld20.16 {q1, q2}, [r0] +; CHECK-NEXT: vld21.16 {q1, q2}, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vadd.f16 q0, q1, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x half>, <32 x half>* %src, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -50,33 +50,18 @@ define void @vld4_v4i32(<16 x i32> *%src, <4 x i32> *%dst) { ; CHECK-LABEL: vld4_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f64 d10, d1 -; CHECK-NEXT: vmov.f32 s19, s7 -; CHECK-NEXT: vmov.f32 s21, s10 -; CHECK-NEXT: vmov.f32 s16, s3 -; CHECK-NEXT: vmov.f32 s15, s6 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vmov.f32 s23, s5 -; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vmov.f32 s3, s4 -; CHECK-NEXT: vmov.f32 s21, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vadd.i32 q0, q0, q5 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.i32 q4, q2, q3 +; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vadd.i32 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x i32>, <16 x i32>* %src, align 4 @@ -94,57 +79,38 @@ define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) { ; CHECK-LABEL: vld4_v8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q3, [r0, #96] -; CHECK-NEXT: vldrw.u32 q1, [r0, #112] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f64 d10, d1 -; CHECK-NEXT: vmov.f32 s19, s7 -; CHECK-NEXT: vmov.f32 s21, s10 -; CHECK-NEXT: vmov.f32 s16, s3 -; CHECK-NEXT: vmov.f32 s15, s6 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov.f32 s23, s6 +; CHECK-NEXT: .save {r4, r5} +; CHECK-NEXT: push {r4, r5} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #88 +; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: add.w r2, r0, #64 +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 
16-byte Reload +; CHECK-NEXT: vadd.i32 q4, q6, q7 +; CHECK-NEXT: vadd.i32 q5, q1, q0 +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vmov.f32 s23, s5 -; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f32 s3, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f32 s21, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.i32 q5, q2, q3 +; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vadd.i32 q0, q0, q5 -; CHECK-NEXT: vmov.f64 d12, d3 -; CHECK-NEXT: vadd.i32 q0, q0, q4 -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s23, s11 -; CHECK-NEXT: vmov.f32 s25, s14 -; CHECK-NEXT: vmov.f32 s20, s7 -; CHECK-NEXT: vmov.f32 s19, s10 -; CHECK-NEXT: vmov.f32 s26, s18 -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vmov.f32 s27, s10 -; CHECK-NEXT: vadd.i32 q5, q6, q5 -; CHECK-NEXT: vmov.f32 s26, s17 -; CHECK-NEXT: vmov.f32 s27, s9 -; CHECK-NEXT: vmov.f32 s24, s5 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vmov.f32 s25, s13 -; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vadd.i32 q1, q1, q6 -; CHECK-NEXT: vadd.i32 q1, q1, q5 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i32>, <32 x i32>* %src, align 4 @@ -162,110 +128,80 @@ define void @vld4_v16i32(<64 x i32> *%src, <16 x i32> *%dst) { ; CHECK-LABEL: vld4_v16i32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5} +; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f64 d10, d1 -; CHECK-NEXT: vmov.f32 s19, s7 -; CHECK-NEXT: vmov.f32 s21, s10 -; CHECK-NEXT: vmov.f32 s16, s3 -; CHECK-NEXT: vmov.f32 s15, s6 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vmov.f32 s23, s5 -; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.f32 s3, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vmov.f32 s21, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vldrw.u32 q2, [r0, #112] -; CHECK-NEXT: vadd.i32 q0, q0, q5 -; CHECK-NEXT: vadd.i32 q0, q0, q4 -; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d0, d3 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s19, s10 -; CHECK-NEXT: vmov.f32 s26, s17 -; CHECK-NEXT: vmov.f32 s23, s11 -; CHECK-NEXT: vmov.f32 s27, s9 -; CHECK-NEXT: vmov.f32 s20, s7 -; CHECK-NEXT: vmov.f32 s24, s5 -; CHECK-NEXT: vmov.f32 s1, s14 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: 
vldrw.u32 q4, [r0, #144] -; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vmov.f32 s3, s10 -; CHECK-NEXT: vldrw.u32 q2, [r0, #128] -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vmov.f32 s25, s13 -; CHECK-NEXT: vadd.i32 q5, q0, q5 -; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vldrw.u32 q3, [r0, #176] -; CHECK-NEXT: vadd.i32 q0, q1, q6 -; CHECK-NEXT: vadd.i32 q1, q0, q5 -; CHECK-NEXT: vldrw.u32 q5, [r0, #160] -; CHECK-NEXT: vmov.f64 d0, d5 -; CHECK-NEXT: vmov.f32 s26, s23 -; CHECK-NEXT: vmov.f32 s23, s14 -; CHECK-NEXT: vmov.f32 s30, s21 -; CHECK-NEXT: vmov.f32 s27, s15 -; CHECK-NEXT: vmov.f32 s31, s13 -; CHECK-NEXT: vmov.f32 s24, s11 -; CHECK-NEXT: vmov.f32 s28, s9 -; CHECK-NEXT: vmov.f32 s1, s18 -; CHECK-NEXT: vmov.f32 s10, s20 -; CHECK-NEXT: vmov.f32 s2, s22 -; CHECK-NEXT: vmov.f32 s11, s12 -; CHECK-NEXT: vmov.f32 s3, s14 -; CHECK-NEXT: vldrw.u32 q3, [r0, #192] -; CHECK-NEXT: vmov.f32 s25, s19 -; CHECK-NEXT: vmov.f32 s29, s17 -; CHECK-NEXT: vadd.i32 q6, q0, q6 -; CHECK-NEXT: vmov.f32 s9, s16 -; CHECK-NEXT: vldrw.u32 q4, [r0, #240] -; CHECK-NEXT: vadd.i32 q0, q2, q7 -; CHECK-NEXT: vmov.f64 d10, d7 -; CHECK-NEXT: vadd.i32 q2, q0, q6 -; CHECK-NEXT: vldrw.u32 q6, [r0, #224] -; CHECK-NEXT: vldrw.u32 q0, [r0, #208] -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmov.f32 s30, s27 -; CHECK-NEXT: vmov.f32 s31, s19 -; CHECK-NEXT: vmov.f32 s21, s2 -; CHECK-NEXT: vmov.f32 s28, s15 -; CHECK-NEXT: vmov.f32 s27, s18 -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vmov.f32 s29, s3 -; CHECK-NEXT: vmov.f32 s23, s18 -; CHECK-NEXT: vadd.i32 q7, q5, q7 -; CHECK-NEXT: vmov.f32 s22, s25 -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s14, s24 -; CHECK-NEXT: vmov.f32 s15, s16 -; CHECK-NEXT: vmov.f32 s21, s1 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vadd.i32 q0, q3, q5 +; CHECK-NEXT: .pad #152 +; CHECK-NEXT: sub sp, #152 +; CHECK-NEXT: add.w r2, r0, #128 +; CHECK-NEXT: add r3, sp, #64 +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: add r4, sp, #64 +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vstmia r3, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: add.w r3, r0, #64 +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: adds r0, #192 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q4, q2, q3 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q4, q5, q0 +; CHECK-NEXT: vldmia r4, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: add r4, sp, #64 +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vstmia r4, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r3] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r3] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r3] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r3] +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: add r3, sp, #64 +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; 
CHECK-NEXT: vadd.i32 q0, q0, q7 -; CHECK-NEXT: vstrw.32 q0, [r1, #48] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q0, q4, q0 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldmia r3, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: @ kill: def $q4 killed $q4 killed $q4_q5_q6_q7 +; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q4, q4, q2 +; CHECK-NEXT: vadd.i32 q5, q6, q0 +; CHECK-NEXT: vldmia r0, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vadd.i32 q5, q2, q3 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vstrw.32 q4, [r1, #48] +; CHECK-NEXT: vadd.i32 q0, q0, q5 +; CHECK-NEXT: vstrw.32 q0, [r1, #32] +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #152 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i32>, <64 x i32>* %src, align 4 @@ -377,145 +313,18 @@ define void @vld4_v8i16(<32 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld4_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q5[6] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u16 r0, q5[7] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q6[0] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q6[1] -; CHECK-NEXT: vmov.16 
q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q6[2] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q6[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q6[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q6[6] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q6[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vadd.i16 q4, q5, q4 -; CHECK-NEXT: vmov.u16 r0, q6[0] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q6[1] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q6[2] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q6[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q6[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q6[6] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q7[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q7[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q7[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q7[3], r0 -; CHECK-NEXT: vmov.u16 r0, q7[0] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q7[1] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q7[2] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q7[3] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q6[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vadd.i16 q0, q5, q0 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.i16 q4, q2, q3 +; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: vadd.i16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i16>, <32 x i16>* %src, align 4 @@ -533,286 +342,38 @@ define void 
@vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) { ; CHECK-LABEL: vld4_v16i16: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5} +; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov.u16 r2, q3[3] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u16 r2, q5[4] -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov.u16 r2, q5[5] -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov.u16 r2, q5[6] -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov.u16 r2, q5[7] -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.16 q6[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov.u16 r2, q6[0] -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q6[1] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q6[2] -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u16 r2, q6[3] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q6[6], r2 -; CHECK-NEXT: vmov.u16 r2, q3[6] -; CHECK-NEXT: vmov.16 q6[7], r2 -; CHECK-NEXT: vmov.u16 r2, q6[4] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u16 r2, q6[5] -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov.u16 r2, q6[6] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q6[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.16 q6[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q6[3], r2 +; CHECK-NEXT: .pad #88 +; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: add.w r2, r0, #64 +; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vld40.16 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld41.16 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld42.16 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld43.16 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vadd.i16 q4, q6, q7 +; CHECK-NEXT: vadd.i16 q5, q1, q0 +; 
CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: vadd.i16 q4, q5, q4 -; CHECK-NEXT: vmov.u16 r2, q6[0] -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q6[1] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q6[2] -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u16 r2, q6[3] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov.u16 r2, q3[0] -; CHECK-NEXT: vmov.16 q6[6], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q6[7], r2 -; CHECK-NEXT: vmov.u16 r2, q6[4] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u16 r2, q6[5] -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov.u16 r2, q6[6] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.16 q7[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q7[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q7[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q7[3], r2 -; CHECK-NEXT: vmov.u16 r2, q7[0] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q7[1] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q7[2] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.u16 r2, q7[3] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov.u16 r2, q3[1] -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov.u16 r2, q6[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vadd.i16 q0, q5, q0 -; CHECK-NEXT: vmov.u16 r2, q3[0] -; CHECK-NEXT: vadd.i16 q0, q0, q4 -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q3[1] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u16 r2, q3[3] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov.u16 r2, q3[6] -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q6[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q6[6] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, 
q0[0] -; CHECK-NEXT: vmov.16 q7[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q7[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q7[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q7[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q7[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q7[7], r0 -; CHECK-NEXT: vmov.u16 r0, q6[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vadd.i16 q5, q5, q7 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vadd.i16 q0, q1, q6 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.i16 q5, q2, q3 +; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: vadd.i16 q0, q0, q5 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: 
add sp, #88 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i16>, <64 x i16>* %src, align 4 @@ -988,273 +549,18 @@ define void @vld4_v16i8(<64 x i8> *%src, <16 x i8> *%dst) { ; CHECK-LABEL: vld4_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: vmov.8 q2[0], r2 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: vmov.8 q2[1], r2 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: vmov.8 q2[2], r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] -; CHECK-NEXT: vmov.8 q2[3], r2 -; CHECK-NEXT: vmov.u8 r2, q1[3] -; CHECK-NEXT: vmov.8 q2[4], r2 -; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.8 q2[5], r2 -; CHECK-NEXT: vmov.u8 r2, q1[11] -; CHECK-NEXT: vmov.8 q2[6], r2 -; CHECK-NEXT: vmov.u8 r2, q1[15] -; CHECK-NEXT: vmov.8 q2[7], r2 -; CHECK-NEXT: vmov.u8 r2, q2[0] -; CHECK-NEXT: vmov.8 q4[0], r2 -; CHECK-NEXT: vmov.u8 r2, q2[1] -; CHECK-NEXT: vmov.8 q4[1], r2 -; CHECK-NEXT: vmov.u8 r2, q2[2] -; CHECK-NEXT: vmov.8 q4[2], r2 -; CHECK-NEXT: vmov.u8 r2, q2[3] -; CHECK-NEXT: vmov.8 q4[3], r2 -; CHECK-NEXT: vmov.u8 r2, q2[4] -; CHECK-NEXT: vmov.8 q4[4], r2 -; CHECK-NEXT: vmov.u8 r2, q2[5] -; CHECK-NEXT: vmov.8 q4[5], r2 -; CHECK-NEXT: vmov.u8 r2, q2[6] -; CHECK-NEXT: vmov.8 q4[6], r2 -; CHECK-NEXT: vmov.u8 r2, q2[7] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.8 q4[7], r2 -; CHECK-NEXT: vmov.u8 r0, q3[3] -; CHECK-NEXT: vmov.u8 r2, q2[3] -; CHECK-NEXT: vmov.8 q5[8], r2 -; CHECK-NEXT: vmov.u8 r2, q2[7] -; CHECK-NEXT: vmov.8 q5[9], r2 -; CHECK-NEXT: vmov.u8 r2, q2[11] -; CHECK-NEXT: vmov.8 q5[10], r2 -; CHECK-NEXT: vmov.u8 r2, q2[15] -; CHECK-NEXT: vmov.8 q5[11], r2 -; CHECK-NEXT: vmov.8 q5[12], r0 -; CHECK-NEXT: vmov.u8 r0, q3[7] -; CHECK-NEXT: vmov.8 q5[13], r0 -; CHECK-NEXT: vmov.u8 r0, q3[11] -; CHECK-NEXT: vmov.8 q5[14], r0 -; CHECK-NEXT: vmov.u8 r0, q3[15] -; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vmov.u8 r0, q5[8] -; CHECK-NEXT: vmov.8 q4[8], r0 -; CHECK-NEXT: vmov.u8 r0, q5[9] -; CHECK-NEXT: vmov.8 q4[9], r0 -; CHECK-NEXT: vmov.u8 r0, q5[10] -; CHECK-NEXT: vmov.8 q4[10], r0 -; CHECK-NEXT: vmov.u8 r0, q5[11] -; CHECK-NEXT: vmov.8 q4[11], r0 -; CHECK-NEXT: vmov.u8 r0, q5[12] -; CHECK-NEXT: vmov.8 q4[12], r0 -; CHECK-NEXT: vmov.u8 r0, q5[13] -; CHECK-NEXT: vmov.8 q4[13], r0 -; CHECK-NEXT: vmov.u8 r0, q5[14] -; CHECK-NEXT: vmov.8 q4[14], r0 -; CHECK-NEXT: vmov.u8 r0, q5[15] -; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.8 q6[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.8 q6[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.8 q6[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.8 q6[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q6[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q6[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q6[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q6[7], r0 -; CHECK-NEXT: vmov.u8 r0, q6[0] -; CHECK-NEXT: vmov.8 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q6[1] -; CHECK-NEXT: vmov.8 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q6[2] -; CHECK-NEXT: vmov.8 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q6[3] -; CHECK-NEXT: vmov.8 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[4] -; CHECK-NEXT: vmov.8 q5[4], r0 -; CHECK-NEXT: 
vmov.u8 r0, q6[5] -; CHECK-NEXT: vmov.8 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q6[6] -; CHECK-NEXT: vmov.8 q5[6], r0 -; CHECK-NEXT: vmov.u8 r0, q6[7] -; CHECK-NEXT: vmov.8 q5[7], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.8 q6[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.8 q6[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.8 q6[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] -; CHECK-NEXT: vmov.8 q6[11], r0 -; CHECK-NEXT: vmov.u8 r0, q3[2] -; CHECK-NEXT: vmov.8 q6[12], r0 -; CHECK-NEXT: vmov.u8 r0, q3[6] -; CHECK-NEXT: vmov.8 q6[13], r0 -; CHECK-NEXT: vmov.u8 r0, q3[10] -; CHECK-NEXT: vmov.8 q6[14], r0 -; CHECK-NEXT: vmov.u8 r0, q3[14] -; CHECK-NEXT: vmov.8 q6[15], r0 -; CHECK-NEXT: vmov.u8 r0, q6[8] -; CHECK-NEXT: vmov.8 q5[8], r0 -; CHECK-NEXT: vmov.u8 r0, q6[9] -; CHECK-NEXT: vmov.8 q5[9], r0 -; CHECK-NEXT: vmov.u8 r0, q6[10] -; CHECK-NEXT: vmov.8 q5[10], r0 -; CHECK-NEXT: vmov.u8 r0, q6[11] -; CHECK-NEXT: vmov.8 q5[11], r0 -; CHECK-NEXT: vmov.u8 r0, q6[12] -; CHECK-NEXT: vmov.8 q5[12], r0 -; CHECK-NEXT: vmov.u8 r0, q6[13] -; CHECK-NEXT: vmov.8 q5[13], r0 -; CHECK-NEXT: vmov.u8 r0, q6[14] -; CHECK-NEXT: vmov.8 q5[14], r0 -; CHECK-NEXT: vmov.u8 r0, q6[15] -; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.8 q6[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.8 q6[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.8 q6[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q6[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q6[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q6[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q6[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q6[7], r0 -; CHECK-NEXT: vadd.i8 q4, q5, q4 -; CHECK-NEXT: vmov.u8 r0, q6[0] -; CHECK-NEXT: vmov.8 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q6[1] -; CHECK-NEXT: vmov.8 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q6[2] -; CHECK-NEXT: vmov.8 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q6[3] -; CHECK-NEXT: vmov.8 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[4] -; CHECK-NEXT: vmov.8 q5[4], r0 -; CHECK-NEXT: vmov.u8 r0, q6[5] -; CHECK-NEXT: vmov.8 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q6[6] -; CHECK-NEXT: vmov.8 q5[6], r0 -; CHECK-NEXT: vmov.u8 r0, q6[7] -; CHECK-NEXT: vmov.8 q5[7], r0 -; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov.8 q6[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q6[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[8] -; CHECK-NEXT: vmov.8 q6[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] -; CHECK-NEXT: vmov.8 q6[11], r0 -; CHECK-NEXT: vmov.u8 r0, q3[0] -; CHECK-NEXT: vmov.8 q6[12], r0 -; CHECK-NEXT: vmov.u8 r0, q3[4] -; CHECK-NEXT: vmov.8 q6[13], r0 -; CHECK-NEXT: vmov.u8 r0, q3[8] -; CHECK-NEXT: vmov.8 q6[14], r0 -; CHECK-NEXT: vmov.u8 r0, q3[12] -; CHECK-NEXT: vmov.8 q6[15], r0 -; CHECK-NEXT: vmov.u8 r0, q6[8] -; CHECK-NEXT: vmov.8 q5[8], r0 -; CHECK-NEXT: vmov.u8 r0, q6[9] -; CHECK-NEXT: vmov.8 q5[9], r0 -; CHECK-NEXT: vmov.u8 r0, q6[10] -; CHECK-NEXT: vmov.8 q5[10], r0 -; CHECK-NEXT: vmov.u8 r0, q6[11] -; CHECK-NEXT: vmov.8 q5[11], r0 -; CHECK-NEXT: vmov.u8 r0, q6[12] -; CHECK-NEXT: vmov.8 q5[12], r0 -; CHECK-NEXT: vmov.u8 r0, q6[13] -; CHECK-NEXT: vmov.8 q5[13], r0 -; CHECK-NEXT: vmov.u8 r0, q6[14] -; CHECK-NEXT: vmov.8 q5[14], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.8 q7[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.8 q7[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.8 q7[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: 
vmov.8 q7[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q7[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q7[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q7[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q7[7], r0 -; CHECK-NEXT: vmov.u8 r0, q7[0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q7[1] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q7[2] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q7[3] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q7[4] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q7[5] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q7[6] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q7[7] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.8 q1[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.8 q1[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[9] -; CHECK-NEXT: vmov.8 q1[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[13] -; CHECK-NEXT: vmov.8 q1[11], r0 -; CHECK-NEXT: vmov.u8 r0, q3[1] -; CHECK-NEXT: vmov.8 q1[12], r0 -; CHECK-NEXT: vmov.u8 r0, q3[5] -; CHECK-NEXT: vmov.8 q1[13], r0 -; CHECK-NEXT: vmov.u8 r0, q3[9] -; CHECK-NEXT: vmov.8 q1[14], r0 -; CHECK-NEXT: vmov.u8 r0, q3[13] -; CHECK-NEXT: vmov.8 q1[15], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q0[15], r0 -; CHECK-NEXT: vmov.u8 r0, q6[15] -; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vadd.i8 q0, q5, q0 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vld40.8 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld41.8 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.8 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.8 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.i8 q4, q2, q3 +; CHECK-NEXT: vadd.i8 q0, q0, q1 ; CHECK-NEXT: vadd.i8 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i8>, <64 x i8>* %src, align 4 @@ -1510,33 +816,18 @@ define void @vld4_v4f32(<16 x float> *%src, <4 x float> *%dst) { ; CHECK-LABEL: vld4_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f64 d10, d1 -; CHECK-NEXT: vmov.f32 s19, s7 -; CHECK-NEXT: vmov.f32 s21, s10 -; CHECK-NEXT: vmov.f32 s16, s3 -; CHECK-NEXT: vmov.f32 s15, s6 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vadd.f32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vmov.f32 s23, s5 -; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vmov.f32 s3, s4 -; CHECK-NEXT: vmov.f32 s21, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vadd.f32 q0, q0, q5 +; CHECK-NEXT: .vsave {d8, d9} +; 
CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.f32 q4, q2, q3 +; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vadd.f32 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x float>, <16 x float>* %src, align 4 @@ -1554,57 +845,38 @@ define void @vld4_v8f32(<32 x float> *%src, <8 x float> *%dst) { ; CHECK-LABEL: vld4_v8f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q3, [r0, #96] -; CHECK-NEXT: vldrw.u32 q1, [r0, #112] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f64 d10, d1 -; CHECK-NEXT: vmov.f32 s19, s7 -; CHECK-NEXT: vmov.f32 s21, s10 -; CHECK-NEXT: vmov.f32 s16, s3 -; CHECK-NEXT: vmov.f32 s15, s6 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov.f32 s23, s6 +; CHECK-NEXT: .save {r4, r5} +; CHECK-NEXT: push {r4, r5} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #88 +; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: add.w r2, r0, #64 +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vadd.f32 q4, q6, q7 +; CHECK-NEXT: vadd.f32 q5, q1, q0 +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: vadd.f32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vmov.f32 s23, s5 -; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f32 s3, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f32 s21, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.f32 q5, q2, q3 +; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vadd.f32 q0, q0, q5 -; CHECK-NEXT: vmov.f64 d12, d3 -; CHECK-NEXT: vadd.f32 q0, q0, q4 -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s23, s11 -; CHECK-NEXT: vmov.f32 s25, s14 -; CHECK-NEXT: vmov.f32 s20, s7 -; CHECK-NEXT: vmov.f32 s19, s10 -; CHECK-NEXT: vmov.f32 s26, s18 -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vmov.f32 s27, s10 -; CHECK-NEXT: vadd.f32 q5, q6, q5 -; CHECK-NEXT: vmov.f32 s26, s17 -; CHECK-NEXT: vmov.f32 s27, s9 -; CHECK-NEXT: vmov.f32 s24, s5 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vmov.f32 s25, s13 -; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vadd.f32 q1, q1, q6 -; CHECK-NEXT: vadd.f32 q1, q1, q5 
-; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x float>, <32 x float>* %src, align 4 @@ -1622,110 +894,80 @@ define void @vld4_v16f32(<64 x float> *%src, <16 x float> *%dst) { ; CHECK-LABEL: vld4_v16f32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5} +; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f64 d10, d1 -; CHECK-NEXT: vmov.f32 s19, s7 -; CHECK-NEXT: vmov.f32 s21, s10 -; CHECK-NEXT: vmov.f32 s16, s3 -; CHECK-NEXT: vmov.f32 s15, s6 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vadd.f32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vmov.f32 s23, s5 -; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.f32 s3, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vmov.f32 s21, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vldrw.u32 q2, [r0, #112] -; CHECK-NEXT: vadd.f32 q0, q0, q5 -; CHECK-NEXT: vadd.f32 q0, q0, q4 -; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d0, d3 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s19, s10 -; CHECK-NEXT: vmov.f32 s26, s17 -; CHECK-NEXT: vmov.f32 s23, s11 -; CHECK-NEXT: vmov.f32 s27, s9 -; CHECK-NEXT: vmov.f32 s20, s7 -; CHECK-NEXT: vmov.f32 s24, s5 -; CHECK-NEXT: vmov.f32 s1, s14 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vldrw.u32 q4, [r0, #144] -; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vmov.f32 s3, s10 -; CHECK-NEXT: vldrw.u32 q2, [r0, #128] -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vmov.f32 s25, s13 -; CHECK-NEXT: vadd.f32 q5, q0, q5 -; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vldrw.u32 q3, [r0, #176] -; CHECK-NEXT: vadd.f32 q0, q1, q6 -; CHECK-NEXT: vadd.f32 q1, q0, q5 -; CHECK-NEXT: vldrw.u32 q5, [r0, #160] -; CHECK-NEXT: vmov.f64 d0, d5 -; CHECK-NEXT: vmov.f32 s26, s23 -; CHECK-NEXT: vmov.f32 s23, s14 -; CHECK-NEXT: vmov.f32 s30, s21 -; CHECK-NEXT: vmov.f32 s27, s15 -; CHECK-NEXT: vmov.f32 s31, s13 -; CHECK-NEXT: vmov.f32 s24, s11 -; CHECK-NEXT: vmov.f32 s28, s9 -; CHECK-NEXT: vmov.f32 s1, s18 -; CHECK-NEXT: vmov.f32 s10, s20 -; CHECK-NEXT: vmov.f32 s2, s22 -; CHECK-NEXT: vmov.f32 s11, s12 -; CHECK-NEXT: vmov.f32 s3, s14 -; CHECK-NEXT: vldrw.u32 q3, [r0, #192] -; CHECK-NEXT: vmov.f32 s25, s19 -; CHECK-NEXT: vmov.f32 s29, s17 -; CHECK-NEXT: vadd.f32 q6, q0, q6 -; CHECK-NEXT: vmov.f32 s9, s16 -; CHECK-NEXT: vldrw.u32 q4, [r0, #240] -; CHECK-NEXT: vadd.f32 q0, q2, q7 -; CHECK-NEXT: vmov.f64 d10, d7 -; CHECK-NEXT: vadd.f32 q2, q0, q6 -; CHECK-NEXT: vldrw.u32 q6, [r0, #224] -; CHECK-NEXT: vldrw.u32 q0, [r0, #208] -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmov.f32 s30, s27 -; CHECK-NEXT: vmov.f32 s31, s19 -; CHECK-NEXT: vmov.f32 s21, s2 -; CHECK-NEXT: vmov.f32 s28, s15 -; CHECK-NEXT: vmov.f32 s27, s18 -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vmov.f32 s29, s3 -; CHECK-NEXT: 
vmov.f32 s23, s18 -; CHECK-NEXT: vadd.f32 q7, q5, q7 -; CHECK-NEXT: vmov.f32 s22, s25 -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s14, s24 -; CHECK-NEXT: vmov.f32 s15, s16 -; CHECK-NEXT: vmov.f32 s21, s1 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vadd.f32 q0, q3, q5 +; CHECK-NEXT: .pad #152 +; CHECK-NEXT: sub sp, #152 +; CHECK-NEXT: add.w r2, r0, #128 +; CHECK-NEXT: add r3, sp, #64 +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: add r4, sp, #64 +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vstmia r3, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: add.w r3, r0, #64 +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: adds r0, #192 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vadd.f32 q4, q2, q3 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vadd.f32 q4, q5, q0 +; CHECK-NEXT: vldmia r4, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: add r4, sp, #64 +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vstmia r4, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r3] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r3] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r3] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r3] +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: add r3, sp, #64 +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vadd.f32 q4, q4, q5 +; CHECK-NEXT: vadd.f32 q0, q1, q0 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 q0, q0, q7 -; CHECK-NEXT: vstrw.32 q0, [r1, #48] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vadd.f32 q0, q4, q0 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldmia r3, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: @ kill: def $q4 killed $q4 killed $q4_q5_q6_q7 +; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vadd.f32 q4, q4, q2 +; CHECK-NEXT: vadd.f32 q5, q6, q0 +; CHECK-NEXT: vldmia r0, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.f32 q4, q4, q5 +; CHECK-NEXT: vadd.f32 q5, q2, q3 +; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vstrw.32 q4, [r1, #48] +; CHECK-NEXT: vadd.f32 q0, q0, q5 +; CHECK-NEXT: vstrw.32 q0, [r1, #32] +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #152 ; CHECK-NEXT: vpop {d8, d9, d10, d11, 
d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x float>, <64 x float>* %src, align 4 @@ -1856,97 +1098,18 @@ define void @vld4_v8f16(<32 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld4_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmovx.f16 s20, s5 -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vmov.16 q4[1], r3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s7 -; CHECK-NEXT: vmovx.f16 s24, s1 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmovx.f16 s24, s3 -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s9 -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s11 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s13 -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s15 -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s0 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vadd.f16 q4, q4, q5 -; CHECK-NEXT: vmovx.f16 s20, s4 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s6 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s2 -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s10 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s12 -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s14 -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vadd.f16 q0, q1, q5 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.f16 q4, q2, q3 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vadd.f16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; 
CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x half>, <32 x half>* %src, align 4 @@ -1964,185 +1127,28 @@ define void @vld4_v16f16(<64 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld4_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmovx.f16 s20, s5 -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vmov.16 q4[1], r3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovx.f16 s20, s7 -; CHECK-NEXT: vmovx.f16 s24, s1 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q5[1], r3 -; CHECK-NEXT: vmovx.f16 s24, s3 -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmovx.f16 s24, s9 -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmovx.f16 s24, s11 -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmovx.f16 s24, s13 -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmovx.f16 s24, s15 -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmovx.f16 s24, s0 -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vadd.f16 q4, q4, q5 -; CHECK-NEXT: vmovx.f16 s20, s4 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovx.f16 s20, s6 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.16 q5[1], r3 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmovx.f16 s24, s2 -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmovx.f16 s24, s10 -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmovx.f16 s24, s12 -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmovx.f16 s24, s14 -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vadd.f16 q0, q1, q5 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vadd.f16 q4, q0, q4 -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vstrw.32 q4, [r1, #16] -; CHECK-NEXT: vmovx.f16 s16, s13 -; CHECK-NEXT: vmovx.f16 s20, s9 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s15 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s11 -; 
CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s5 -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s7 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s1 -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s3 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s24, s12 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vadd.f16 q4, q5, q4 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmovx.f16 s12, s14 -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s8, s10 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s4, s6 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vadd.f16 q0, q5, q3 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: add.w r2, r0, #64 +; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r2] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.f16 q4, q2, q3 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vadd.f16 q0, q0, q4 +; CHECK-NEXT: vld40.16 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld41.16 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld42.16 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld43.16 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 +; CHECK-NEXT: vadd.f16 q0, q3, q4 +; CHECK-NEXT: vadd.f16 q1, q1, q2 +; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x half>, <64 x half>* %src, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -36,18 +36,10 @@ define void @vst2_v4i32(<4 x i32> *%src, <8 x i32> *%dst) { ; CHECK-LABEL: vst2_v4i32: 
; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d3 -; CHECK-NEXT: vmov.f64 d6, d2 -; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s15, s1 -; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vst20.32 {q0, q1}, [r1] +; CHECK-NEXT: vst21.32 {q0, q1}, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0 @@ -62,31 +54,15 @@ define void @vst2_v8i32(<8 x i32> *%src, <16 x i32> *%dst) { ; CHECK-LABEL: vst2_v8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s17 -; CHECK-NEXT: vmov.f32 s15, s1 -; CHECK-NEXT: vmov.f32 s0, s18 -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s19 -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s18, s9 -; CHECK-NEXT: vmov.f32 s19, s5 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add.w r0, r1, #32 +; CHECK-NEXT: vst20.32 {q0, q1}, [r1] +; CHECK-NEXT: vst21.32 {q0, q1}, [r1] +; CHECK-NEXT: vst20.32 {q2, q3}, [r0] +; CHECK-NEXT: vst21.32 {q2, q3}, [r0] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0 @@ -103,57 +79,25 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: vldrw.u32 q7, [r0, #48] -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d14, d0 -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov.f32 s29, s16 -; CHECK-NEXT: vmov.f32 s30, s1 -; CHECK-NEXT: vmov.f32 s31, s17 -; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d14, d10 -; CHECK-NEXT: vmov.f32 s29, s4 -; CHECK-NEXT: vmov.f32 s30, s21 -; CHECK-NEXT: vmov.f32 s31, s5 -; CHECK-NEXT: vmov.f32 s4, s22 -; CHECK-NEXT: vstrw.32 q7, [r1, #32] -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s23 -; CHECK-NEXT: vmov.f64 d10, d12 -; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vmov.f32 s16, s2 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.f32 s17, s18 -; CHECK-NEXT: vmov.f32 s22, s25 -; CHECK-NEXT: vmov.f32 s18, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s23, s9 -; CHECK-NEXT: vstrw.32 q4, [r1, #16] -; CHECK-NEXT: vmov.f32 s8, s26 -; CHECK-NEXT: vstrw.32 q5, [r1, #64] -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s10, s27 -; CHECK-NEXT: vmov.f64 d12, d0 -; 
CHECK-NEXT: vstrw.32 q2, [r1, #80] -; CHECK-NEXT: vmov.f32 s25, s12 -; CHECK-NEXT: vmov.f32 s26, s1 -; CHECK-NEXT: vmov.f32 s27, s13 -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vstrw.32 q6, [r1, #96] -; CHECK-NEXT: vmov.f32 s13, s14 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vldrw.u32 q1, [r0, #112] +; CHECK-NEXT: vldrw.u32 q3, [r0, #96] +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] +; CHECK-NEXT: vldrw.u32 q7, [r0, #64] +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] +; CHECK-NEXT: add.w r0, r1, #96 +; CHECK-NEXT: add.w r2, r1, #64 +; CHECK-NEXT: add.w r3, r1, #32 +; CHECK-NEXT: vst20.32 {q6, q7}, [r1] +; CHECK-NEXT: vst21.32 {q6, q7}, [r1] +; CHECK-NEXT: vst20.32 {q4, q5}, [r3] +; CHECK-NEXT: vst21.32 {q4, q5}, [r3] +; CHECK-NEXT: vst20.32 {q2, q3}, [r2] +; CHECK-NEXT: vst21.32 {q2, q3}, [r2] +; CHECK-NEXT: vst20.32 {q0, q1}, [r0] +; CHECK-NEXT: vst21.32 {q0, q1}, [r0] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -212,42 +156,10 @@ define void @vst2_v8i16(<8 x i16> *%src, <16 x i16> *%dst) { ; CHECK-LABEL: vst2_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vst20.16 {q0, q1}, [r1] +; CHECK-NEXT: vst21.16 {q0, q1}, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -262,81 +174,15 @@ define void @vst2_v16i16(<16 x i16> *%src, <32 x i16> *%dst) { ; CHECK-LABEL: vst2_v16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q3[0] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.u16 r2, q3[1] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q0[5], r2 -; 
CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.u16 r2, q3[3] -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov.u16 r2, q3[6] -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov.u16 r2, q3[0] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vstrw.32 q5, [r1, #48] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: add.w r0, r1, #32 +; CHECK-NEXT: vst20.16 {q0, q1}, [r1] +; CHECK-NEXT: vst21.16 {q0, q1}, [r1] +; CHECK-NEXT: vst20.16 {q2, q3}, [r0] +; CHECK-NEXT: vst21.16 {q2, q3}, [r0] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0 @@ -412,74 +258,10 @@ define void @vst2_v16i8(<16 x i8> *%src, <32 x i8> *%dst) { ; CHECK-LABEL: vst2_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.u8 r2, q1[8] -; CHECK-NEXT: vmov.u8 r0, q2[8] -; CHECK-NEXT: vmov.8 q0[0], r2 -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[9] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q2[11] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[13] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] 
-; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] -; CHECK-NEXT: vmov.8 q0[15], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov.8 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.8 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.8 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.8 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q3[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q3[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q3[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.8 q3[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q3[12], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.8 q3[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q3[14], r0 -; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.8 q3[15], r0 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vst20.8 {q0, q1}, [r1] +; CHECK-NEXT: vst21.8 {q0, q1}, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0 @@ -581,18 +363,10 @@ define void @vst2_v4f32(<4 x float> *%src, <8 x float> *%dst) { ; CHECK-LABEL: vst2_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d3 -; CHECK-NEXT: vmov.f64 d6, d2 -; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s15, s1 -; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vst20.32 {q0, q1}, [r1] +; CHECK-NEXT: vst21.32 {q0, q1}, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0 @@ -607,31 +381,15 @@ define void @vst2_v8f32(<8 x float> *%src, <16 x float> *%dst) { ; CHECK-LABEL: vst2_v8f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s17 -; CHECK-NEXT: vmov.f32 s15, s1 -; CHECK-NEXT: vmov.f32 s0, s18 -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s19 -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s18, s9 -; CHECK-NEXT: vmov.f32 s19, s5 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add.w r0, r1, #32 +; CHECK-NEXT: vst20.32 {q0, q1}, [r1] +; CHECK-NEXT: vst21.32 {q0, q1}, [r1] +; CHECK-NEXT: vst20.32 {q2, q3}, [r0] +; CHECK-NEXT: vst21.32 {q2, q3}, [r0] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x float>, 
<8 x float>* %src, i32 0 @@ -648,57 +406,25 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: vldrw.u32 q7, [r0, #48] -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d14, d0 -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov.f32 s29, s16 -; CHECK-NEXT: vmov.f32 s30, s1 -; CHECK-NEXT: vmov.f32 s31, s17 -; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d14, d10 -; CHECK-NEXT: vmov.f32 s29, s4 -; CHECK-NEXT: vmov.f32 s30, s21 -; CHECK-NEXT: vmov.f32 s31, s5 -; CHECK-NEXT: vmov.f32 s4, s22 -; CHECK-NEXT: vstrw.32 q7, [r1, #32] -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s23 -; CHECK-NEXT: vmov.f64 d10, d12 -; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vmov.f32 s16, s2 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.f32 s17, s18 -; CHECK-NEXT: vmov.f32 s22, s25 -; CHECK-NEXT: vmov.f32 s18, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s23, s9 -; CHECK-NEXT: vstrw.32 q4, [r1, #16] -; CHECK-NEXT: vmov.f32 s8, s26 -; CHECK-NEXT: vstrw.32 q5, [r1, #64] -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s10, s27 -; CHECK-NEXT: vmov.f64 d12, d0 -; CHECK-NEXT: vstrw.32 q2, [r1, #80] -; CHECK-NEXT: vmov.f32 s25, s12 -; CHECK-NEXT: vmov.f32 s26, s1 -; CHECK-NEXT: vmov.f32 s27, s13 -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vstrw.32 q6, [r1, #96] -; CHECK-NEXT: vmov.f32 s13, s14 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vldrw.u32 q1, [r0, #112] +; CHECK-NEXT: vldrw.u32 q3, [r0, #96] +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] +; CHECK-NEXT: vldrw.u32 q7, [r0, #64] +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] +; CHECK-NEXT: add.w r0, r1, #96 +; CHECK-NEXT: add.w r2, r1, #64 +; CHECK-NEXT: add.w r3, r1, #32 +; CHECK-NEXT: vst20.32 {q6, q7}, [r1] +; CHECK-NEXT: vst21.32 {q6, q7}, [r1] +; CHECK-NEXT: vst20.32 {q4, q5}, [r3] +; CHECK-NEXT: vst21.32 {q4, q5}, [r3] +; CHECK-NEXT: vst20.32 {q2, q3}, [r2] +; CHECK-NEXT: vst21.32 {q2, q3}, [r2] +; CHECK-NEXT: vst20.32 {q0, q1}, [r0] +; CHECK-NEXT: vst21.32 {q0, q1}, [r0] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -787,50 +513,10 @@ define void @vst2_v8f16(<8 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vst2_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmovx.f16 s12, s6 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s2 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s3 -; CHECK-NEXT: 
vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vst20.16 {q0, q1}, [r1] +; CHECK-NEXT: vst21.16 {q0, q1}, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 @@ -845,97 +531,15 @@ define void @vst2_v16f16(<16 x half> *%src, <32 x half> *%dst) { ; CHECK-LABEL: vst2_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10} -; CHECK-NEXT: vpush {d8, d9, d10} -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmovx.f16 s0, s12 -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q4[1], r3 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s13 -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmovx.f16 s20, s14 -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s10 -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s12, s15 -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s1 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s6 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s2 -; CHECK-NEXT: vmov.16 
q2[2], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vstrw.32 q4, [r1, #48] -; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10} +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: add.w r0, r1, #32 +; CHECK-NEXT: vst20.16 {q2, q3}, [r1] +; CHECK-NEXT: vst21.16 {q2, q3}, [r1] +; CHECK-NEXT: vst20.16 {q0, q1}, [r0] +; CHECK-NEXT: vst21.16 {q0, q1}, [r0] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -50,39 +50,14 @@ define void @vst4_v4i32(<4 x i32> *%src, <16 x i32> *%dst) { ; CHECK-LABEL: vst4_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov.32 r0, q3[1] -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s1, s5 -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov.32 r0, q3[0] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov.32 r0, q3[3] -; CHECK-NEXT: vmov.f32 s16, s8 -; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vmov.f32 s20, s11 -; CHECK-NEXT: vmov.32 r0, q3[2] -; CHECK-NEXT: vmov.f32 s8, s10 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s21, s7 -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vstrw.32 q5, [r1, #48] -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0 @@ -105,75 +80,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d2, d6 -; CHECK-NEXT: vmov.32 r2, q5[0] -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.f32 s5, s16 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r2, q5[1] -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.f32 s16, s13 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vmov.f32 s1, s17 -; CHECK-NEXT: vmov.f64 d2, d7 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r2, q5[2] -; 
CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.f32 s5, s18 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s12, s15 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r2, q5[3] -; CHECK-NEXT: vldrw.u32 q7, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #48] -; CHECK-NEXT: vmov.f32 s13, s19 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vmov.32 r0, q2[0] -; CHECK-NEXT: vmov.f64 d10, d14 -; CHECK-NEXT: vmov.f32 s15, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s21, s24 -; CHECK-NEXT: vmov.32 r0, q2[1] -; CHECK-NEXT: vmov.f32 s22, s2 -; CHECK-NEXT: vmov.f32 s23, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f64 d8, d15 -; CHECK-NEXT: vmov.32 r0, q2[2] -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.32 r0, q2[3] -; CHECK-NEXT: vdup.32 q2, r0 -; CHECK-NEXT: vstrw.32 q5, [r1, #64] -; CHECK-NEXT: vstrw.32 q3, [r1, #48] -; CHECK-NEXT: vmov.f32 s0, s29 -; CHECK-NEXT: vmov.f32 s17, s26 -; CHECK-NEXT: vmov.f32 s24, s29 -; CHECK-NEXT: vmov.f32 s1, s25 -; CHECK-NEXT: vstrw.32 q0, [r1, #80] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s6 -; CHECK-NEXT: vmov.f32 s19, s7 -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f32 s4, s31 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s5, s27 -; CHECK-NEXT: vstrw.32 q4, [r1, #96] -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vstrw.32 q1, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vldrw.u32 q6, [r0, #64] +; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: add.w r0, r1, #64 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -195,155 +118,68 @@ define void @vst4_v16i32(<16 x i32> *%src, <64 x i32> *%dst) { ; CHECK-LABEL: vst4_v16i32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5} +; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #160 -; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0, #176] -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vmov.32 r2, q1[2] -; CHECK-NEXT: vmov.f64 d8, d5 -; CHECK-NEXT: vdup.32 q3, r2 -; CHECK-NEXT: vldrw.u32 q6, [r0, #160] -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-NEXT: vldrw.u32 q7, [r0, #64] -; CHECK-NEXT: vstrw.32 q6, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [r0, #96] -; CHECK-NEXT: vmov.f32 s17, s2 -; CHECK-NEXT: vmov.f32 s18, s14 -; CHECK-NEXT: vstrw.32 q6, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s19, s15 -; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 
q3, [r0, #80] -; CHECK-NEXT: vstrw.32 q6, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vstrw.32 q4, [r1, #224] -; CHECK-NEXT: vmov.f32 s16, s11 -; CHECK-NEXT: vmov.32 r0, q1[3] -; CHECK-NEXT: vmov.f32 s17, s3 -; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov.f32 s19, s27 -; CHECK-NEXT: vstrw.32 q4, [r1, #240] -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vmov.32 r0, q1[0] -; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vmov.f32 s17, s0 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov.f32 s19, s27 -; CHECK-NEXT: vstrw.32 q4, [r1, #192] -; CHECK-NEXT: vmov.32 r0, q1[1] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q2, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vldrw.u32 q6, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #208] -; CHECK-NEXT: vmov.f64 d0, d9 -; CHECK-NEXT: vmov.32 r0, q2[2] -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.f32 s1, s26 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vstrw.32 q0, [r1, #160] -; CHECK-NEXT: vmov.f32 s0, s19 -; CHECK-NEXT: vmov.32 r0, q2[3] -; CHECK-NEXT: vmov.f32 s1, s27 -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.f32 s8, s15 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.f64 d2, d6 -; CHECK-NEXT: vstrw.32 q0, [r1, #176] -; CHECK-NEXT: vmov.32 r0, q5[0] -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s5, s28 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r0, q5[1] -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s28, s13 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vmov.f32 s1, s29 -; CHECK-NEXT: vmov.f64 d2, d7 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r0, q5[2] -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.32 r0, q5[3] -; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s5, s30 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s9, s31 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.32 r0, q3[0] -; CHECK-NEXT: vmov.f64 d12, d4 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.32 r0, q3[1] -; CHECK-NEXT: vdup.32 q7, r0 -; CHECK-NEXT: vmov.32 r0, q3[2] -; CHECK-NEXT: vmov.f32 s25, s20 -; CHECK-NEXT: vmov.f32 s26, s2 -; CHECK-NEXT: vmov.f64 d8, d5 -; CHECK-NEXT: vmov.f32 s27, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s20, s9 -; CHECK-NEXT: vmov.32 r0, q3[3] -; CHECK-NEXT: vmov.f32 s17, s22 -; CHECK-NEXT: vldrw.u32 q3, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vmov.f32 s4, s11 -; CHECK-NEXT: vmov.f32 s28, s9 -; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s19, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s5, s23 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vmov.f32 s1, 
s12 -; CHECK-NEXT: vmov.f32 s12, s9 -; CHECK-NEXT: vldrw.u32 q2, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s29, s21 -; CHECK-NEXT: vmov.32 r0, q2[0] -; CHECK-NEXT: vdup.32 q5, r0 -; CHECK-NEXT: vmov.32 r0, q2[1] -; CHECK-NEXT: vmov.f32 s2, s22 -; CHECK-NEXT: vdup.32 q2, r0 -; CHECK-NEXT: vmov.f32 s3, s23 -; CHECK-NEXT: vstrw.32 q4, [r1, #96] -; CHECK-NEXT: vstrw.32 q0, [r1, #128] -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s10 -; CHECK-NEXT: vstrw.32 q1, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s15, s11 -; CHECK-NEXT: vstrw.32 q6, [r1, #64] -; CHECK-NEXT: vstrw.32 q0, [r1, #48] -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [r1, #144] -; CHECK-NEXT: vstrw.32 q7, [r1, #80] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: add sp, #160 +; CHECK-NEXT: .pad #152 +; CHECK-NEXT: sub sp, #152 +; CHECK-NEXT: vldrw.u32 q2, [r0, #176] +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vldrw.u32 q6, [r0, #128] +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #112] +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vldrw.u32 q1, [r0, #96] +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #144] +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: add.w r0, r1, #64 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: add.w r0, r1, #192 +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: adds r1, #128 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: 
vst43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: add sp, #152 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0 @@ -462,136 +298,14 @@ define void @vst4_v8i16(<8 x i16> *%src, <32 x i16> *%dst) { ; CHECK-LABEL: vst4_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.u16 r2, q4[0] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r2, q4[1] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r0, q5[2] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q5[6] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q5[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.u16 r0, q5[0] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q5[1] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q6[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q6[3] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q6[6] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u16 r0, q6[7] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.u16 r0, q6[0] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q6[1] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.16 q7[2], r0 -; CHECK-NEXT: vmov.16 q7[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov.16 q7[7], r0 -; CHECK-NEXT: vmov.u16 r0, q7[2] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q7[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q6[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; 
CHECK-NEXT: vmov.u16 r0, q7[6] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q7[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vstrw.32 q5, [r1, #48] -; CHECK-NEXT: vmov.u16 r0, q6[0] -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q6[1] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u16 r0, q6[4] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vstrw.32 q1, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -614,272 +328,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldrw.u32 q6, [r0, #48] -; CHECK-NEXT: vmov.u16 r2, q3[0] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q4[0] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q3[1] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.u16 r2, q4[1] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q2[6], r2 -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q4[2] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q3[3] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.u16 r2, q4[3] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; 
CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q2[6], r2 -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q4[4] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.u16 r2, q4[5] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vstrw.32 q5, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q2[6], r2 -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u16 r2, q3[6] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q4[6] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.u16 r2, q4[7] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q7[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.16 q7[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q2[6], r2 -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q7[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q7[3], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q7[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q7[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.16 q7[7], r2 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vstrw.32 q7, [r1, #48] -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q6[0] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.u16 r2, q6[1] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 
r0, q0[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q6[2] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[3] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vstrw.32 q5, [r1, #64] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q6[4] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vstrw.32 q4, [r1, #80] -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q6[6] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[7] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vstrw.32 q2, [r1, #96] -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vldrw.u32 q0, 
[sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vstrw.32 q6, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q6, [r0, #64] +; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: add.w r0, r1, #64 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst42.16 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst43.16 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1079,256 +544,14 @@ define void @vst4_v16i8(<16 x i8> *%src, <64 x i8> *%dst) { ; CHECK-LABEL: vst4_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.u8 r2, q1[4] -; CHECK-NEXT: vmov.8 q4[0], r2 -; CHECK-NEXT: vmov.u8 r2, q2[4] -; CHECK-NEXT: vmov.8 q4[1], r2 -; CHECK-NEXT: vmov.u8 r2, q1[5] -; CHECK-NEXT: vmov.u8 r0, q3[4] -; CHECK-NEXT: vmov.8 q4[4], r2 -; CHECK-NEXT: vmov.8 q5[2], r0 -; CHECK-NEXT: vmov.u8 r2, q2[5] -; CHECK-NEXT: vmov.8 q4[5], r2 -; CHECK-NEXT: vmov.u8 r2, q1[6] -; CHECK-NEXT: vmov.8 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[5] -; CHECK-NEXT: vmov.8 q5[6], r0 -; CHECK-NEXT: vmov.8 q4[8], r2 -; CHECK-NEXT: vmov.u8 r2, q2[6] -; CHECK-NEXT: vmov.8 q5[7], r0 -; CHECK-NEXT: vmov.8 q4[9], r2 -; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.u8 r0, q3[6] -; CHECK-NEXT: vmov.8 q4[12], r2 -; CHECK-NEXT: vmov.8 q5[10], r0 -; CHECK-NEXT: vmov.u8 r2, q2[7] -; CHECK-NEXT: vmov.8 q4[13], r2 -; CHECK-NEXT: vmov.8 q5[11], r0 -; CHECK-NEXT: vmov.u8 r0, q3[7] -; CHECK-NEXT: vmov.u8 r2, q4[0] -; CHECK-NEXT: vmov.8 q5[14], r0 -; CHECK-NEXT: vmov.8 q0[0], r2 -; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vmov.u8 r2, q4[1] -; CHECK-NEXT: vmov.8 q0[1], r2 -; CHECK-NEXT: vmov.u8 r0, q5[2] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q5[3] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q4[4] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q4[5] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q5[6] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q5[7] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q4[8] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q4[9] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q5[10] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q5[11] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q4[12] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q4[13] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q5[14] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q5[15] -; CHECK-NEXT: vmov.8 q0[15], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov.8 
q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q5[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.8 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q5[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.8 q5[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q5[12], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.8 q5[13], r0 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.u8 r0, q5[0] -; CHECK-NEXT: vmov.8 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q5[1] -; CHECK-NEXT: vmov.8 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[0] -; CHECK-NEXT: vmov.8 q6[2], r0 -; CHECK-NEXT: vmov.8 q6[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[1] -; CHECK-NEXT: vmov.8 q6[6], r0 -; CHECK-NEXT: vmov.8 q6[7], r0 -; CHECK-NEXT: vmov.u8 r0, q3[2] -; CHECK-NEXT: vmov.8 q6[10], r0 -; CHECK-NEXT: vmov.8 q6[11], r0 -; CHECK-NEXT: vmov.u8 r0, q3[3] -; CHECK-NEXT: vmov.8 q6[14], r0 -; CHECK-NEXT: vmov.8 q6[15], r0 -; CHECK-NEXT: vmov.u8 r0, q6[2] -; CHECK-NEXT: vmov.8 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q6[3] -; CHECK-NEXT: vmov.8 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q5[4] -; CHECK-NEXT: vmov.8 q4[4], r0 -; CHECK-NEXT: vmov.u8 r0, q5[5] -; CHECK-NEXT: vmov.8 q4[5], r0 -; CHECK-NEXT: vmov.u8 r0, q6[6] -; CHECK-NEXT: vmov.8 q4[6], r0 -; CHECK-NEXT: vmov.u8 r0, q6[7] -; CHECK-NEXT: vmov.8 q4[7], r0 -; CHECK-NEXT: vmov.u8 r0, q5[8] -; CHECK-NEXT: vmov.8 q4[8], r0 -; CHECK-NEXT: vmov.u8 r0, q5[9] -; CHECK-NEXT: vmov.8 q4[9], r0 -; CHECK-NEXT: vmov.u8 r0, q6[10] -; CHECK-NEXT: vmov.8 q4[10], r0 -; CHECK-NEXT: vmov.u8 r0, q6[11] -; CHECK-NEXT: vmov.8 q4[11], r0 -; CHECK-NEXT: vmov.u8 r0, q5[12] -; CHECK-NEXT: vmov.8 q4[12], r0 -; CHECK-NEXT: vmov.u8 r0, q5[13] -; CHECK-NEXT: vmov.8 q4[13], r0 -; CHECK-NEXT: vmov.u8 r0, q6[14] -; CHECK-NEXT: vmov.8 q4[14], r0 -; CHECK-NEXT: vmov.u8 r0, q6[15] -; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q6[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] -; CHECK-NEXT: vmov.8 q6[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q6[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[13] -; CHECK-NEXT: vmov.8 q6[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q6[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] -; CHECK-NEXT: vmov.8 q6[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q6[12], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] -; CHECK-NEXT: vmov.8 q6[13], r0 -; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.u8 r0, q6[0] -; CHECK-NEXT: vmov.8 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q6[1] -; CHECK-NEXT: vmov.8 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[12] -; CHECK-NEXT: vmov.8 q7[2], r0 -; CHECK-NEXT: vmov.8 q7[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[13] -; CHECK-NEXT: vmov.8 q7[6], r0 -; CHECK-NEXT: vmov.8 q7[7], r0 -; CHECK-NEXT: vmov.u8 r0, q3[14] -; CHECK-NEXT: vmov.8 q7[10], r0 -; CHECK-NEXT: vmov.8 q7[11], r0 -; CHECK-NEXT: vmov.u8 r0, q3[15] -; CHECK-NEXT: vmov.8 q7[14], r0 -; CHECK-NEXT: vmov.8 q7[15], r0 -; CHECK-NEXT: vmov.u8 r0, q7[2] -; CHECK-NEXT: vmov.8 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q7[3] -; CHECK-NEXT: vmov.8 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[4] -; CHECK-NEXT: vmov.8 q5[4], r0 -; CHECK-NEXT: vmov.u8 r0, q6[5] -; CHECK-NEXT: vmov.8 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q7[6] -; CHECK-NEXT: vmov.8 q5[6], r0 -; CHECK-NEXT: vmov.u8 r0, q7[7] -; CHECK-NEXT: vmov.8 q5[7], r0 -; CHECK-NEXT: vmov.u8 r0, q6[8] -; CHECK-NEXT: vmov.8 q5[8], r0 -; CHECK-NEXT: vmov.u8 r0, q6[9] -; CHECK-NEXT: vmov.8 q5[9], r0 -; CHECK-NEXT: vmov.u8 r0, q7[10] -; CHECK-NEXT: vmov.8 
q5[10], r0 -; CHECK-NEXT: vmov.u8 r0, q7[11] -; CHECK-NEXT: vmov.8 q5[11], r0 -; CHECK-NEXT: vmov.u8 r0, q6[12] -; CHECK-NEXT: vmov.8 q5[12], r0 -; CHECK-NEXT: vmov.u8 r0, q6[13] -; CHECK-NEXT: vmov.8 q5[13], r0 -; CHECK-NEXT: vmov.u8 r0, q7[14] -; CHECK-NEXT: vmov.8 q5[14], r0 -; CHECK-NEXT: vmov.u8 r0, q7[15] -; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q6[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[8] -; CHECK-NEXT: vmov.8 q6[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q6[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[9] -; CHECK-NEXT: vmov.8 q6[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q6[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.8 q6[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q6[12], r0 -; CHECK-NEXT: vmov.u8 r0, q2[11] -; CHECK-NEXT: vmov.8 q6[13], r0 -; CHECK-NEXT: vstrw.32 q5, [r1, #48] -; CHECK-NEXT: vmov.u8 r0, q6[0] -; CHECK-NEXT: vmov.8 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q6[1] -; CHECK-NEXT: vmov.8 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[8] -; CHECK-NEXT: vmov.8 q2[2], r0 -; CHECK-NEXT: vmov.8 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[9] -; CHECK-NEXT: vmov.8 q2[6], r0 -; CHECK-NEXT: vmov.8 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q3[10] -; CHECK-NEXT: vmov.8 q2[10], r0 -; CHECK-NEXT: vmov.8 q2[11], r0 -; CHECK-NEXT: vmov.u8 r0, q3[11] -; CHECK-NEXT: vmov.8 q2[14], r0 -; CHECK-NEXT: vmov.8 q2[15], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.8 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.8 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[4] -; CHECK-NEXT: vmov.8 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q6[5] -; CHECK-NEXT: vmov.8 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.8 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.8 q1[7], r0 -; CHECK-NEXT: vmov.u8 r0, q6[8] -; CHECK-NEXT: vmov.8 q1[8], r0 -; CHECK-NEXT: vmov.u8 r0, q6[9] -; CHECK-NEXT: vmov.8 q1[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.8 q1[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[11] -; CHECK-NEXT: vmov.8 q1[11], r0 -; CHECK-NEXT: vmov.u8 r0, q6[12] -; CHECK-NEXT: vmov.8 q1[12], r0 -; CHECK-NEXT: vmov.u8 r0, q6[13] -; CHECK-NEXT: vmov.8 q1[13], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] -; CHECK-NEXT: vmov.8 q1[14], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] -; CHECK-NEXT: vmov.8 q1[15], r0 -; CHECK-NEXT: vstrw.32 q1, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vst40.8 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst41.8 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst42.8 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.8 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0 @@ -1483,39 +706,14 @@ define void @vst4_v4f32(<4 x float> *%src, <16 x float> *%dst) { ; CHECK-LABEL: vst4_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov.32 r0, q3[1] -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s1, s5 -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov.32 r0, q3[0] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: 
vmov.32 r0, q3[3] -; CHECK-NEXT: vmov.f32 s16, s8 -; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vmov.f32 s20, s11 -; CHECK-NEXT: vmov.32 r0, q3[2] -; CHECK-NEXT: vmov.f32 s8, s10 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s21, s7 -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vstrw.32 q5, [r1, #48] -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0 @@ -1538,75 +736,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d2, d6 -; CHECK-NEXT: vmov.32 r2, q5[0] -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.f32 s5, s16 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r2, q5[1] -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.f32 s16, s13 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vmov.f32 s1, s17 -; CHECK-NEXT: vmov.f64 d2, d7 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r2, q5[2] -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.f32 s5, s18 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s12, s15 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r2, q5[3] -; CHECK-NEXT: vldrw.u32 q7, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #48] -; CHECK-NEXT: vmov.f32 s13, s19 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vmov.32 r0, q2[0] -; CHECK-NEXT: vmov.f64 d10, d14 -; CHECK-NEXT: vmov.f32 s15, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s21, s24 -; CHECK-NEXT: vmov.32 r0, q2[1] -; CHECK-NEXT: vmov.f32 s22, s2 -; CHECK-NEXT: vmov.f32 s23, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f64 d8, d15 -; CHECK-NEXT: vmov.32 r0, q2[2] -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.32 r0, q2[3] -; CHECK-NEXT: vdup.32 q2, r0 -; CHECK-NEXT: vstrw.32 q5, [r1, #64] -; CHECK-NEXT: vstrw.32 q3, [r1, #48] -; CHECK-NEXT: vmov.f32 s0, s29 -; CHECK-NEXT: vmov.f32 s17, s26 -; CHECK-NEXT: vmov.f32 s24, s29 -; CHECK-NEXT: vmov.f32 s1, s25 -; CHECK-NEXT: vstrw.32 q0, [r1, #80] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s6 -; CHECK-NEXT: vmov.f32 s19, s7 -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f32 s4, s31 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s5, s27 -; CHECK-NEXT: vstrw.32 q4, [r1, #96] -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vstrw.32 q1, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vldrw.u32 q6, 
[r0, #64] +; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: add.w r0, r1, #64 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1628,155 +774,68 @@ define void @vst4_v16f32(<16 x float> *%src, <64 x float> *%dst) { ; CHECK-LABEL: vst4_v16f32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5} +; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #160 -; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0, #176] -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vmov.32 r2, q1[2] -; CHECK-NEXT: vmov.f64 d8, d5 -; CHECK-NEXT: vdup.32 q3, r2 -; CHECK-NEXT: vldrw.u32 q6, [r0, #160] -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-NEXT: vldrw.u32 q7, [r0, #64] -; CHECK-NEXT: vstrw.32 q6, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [r0, #96] -; CHECK-NEXT: vmov.f32 s17, s2 -; CHECK-NEXT: vmov.f32 s18, s14 -; CHECK-NEXT: vstrw.32 q6, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s19, s15 -; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vstrw.32 q6, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vstrw.32 q4, [r1, #224] -; CHECK-NEXT: vmov.f32 s16, s11 -; CHECK-NEXT: vmov.32 r0, q1[3] -; CHECK-NEXT: vmov.f32 s17, s3 -; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov.f32 s19, s27 -; CHECK-NEXT: vstrw.32 q4, [r1, #240] -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vmov.32 r0, q1[0] -; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vmov.f32 s17, s0 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov.f32 s19, s27 -; CHECK-NEXT: vstrw.32 q4, [r1, #192] -; CHECK-NEXT: vmov.32 r0, q1[1] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q2, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vldrw.u32 q6, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #208] -; CHECK-NEXT: vmov.f64 d0, d9 -; CHECK-NEXT: vmov.32 r0, q2[2] -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.f32 s1, s26 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vstrw.32 q0, [r1, #160] -; CHECK-NEXT: vmov.f32 s0, s19 -; CHECK-NEXT: vmov.32 r0, q2[3] -; CHECK-NEXT: vmov.f32 s1, s27 -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.f32 s8, s15 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.f64 d2, d6 -; CHECK-NEXT: vstrw.32 q0, [r1, #176] -; CHECK-NEXT: vmov.32 r0, q5[0] -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s5, s28 
-; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r0, q5[1] -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s28, s13 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vmov.f32 s1, s29 -; CHECK-NEXT: vmov.f64 d2, d7 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.32 r0, q5[2] -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.32 r0, q5[3] -; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s5, s30 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s9, s31 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.32 r0, q3[0] -; CHECK-NEXT: vmov.f64 d12, d4 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.32 r0, q3[1] -; CHECK-NEXT: vdup.32 q7, r0 -; CHECK-NEXT: vmov.32 r0, q3[2] -; CHECK-NEXT: vmov.f32 s25, s20 -; CHECK-NEXT: vmov.f32 s26, s2 -; CHECK-NEXT: vmov.f64 d8, d5 -; CHECK-NEXT: vmov.f32 s27, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s20, s9 -; CHECK-NEXT: vmov.32 r0, q3[3] -; CHECK-NEXT: vmov.f32 s17, s22 -; CHECK-NEXT: vldrw.u32 q3, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vmov.f32 s4, s11 -; CHECK-NEXT: vmov.f32 s28, s9 -; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s19, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s5, s23 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s12, s9 -; CHECK-NEXT: vldrw.u32 q2, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s29, s21 -; CHECK-NEXT: vmov.32 r0, q2[0] -; CHECK-NEXT: vdup.32 q5, r0 -; CHECK-NEXT: vmov.32 r0, q2[1] -; CHECK-NEXT: vmov.f32 s2, s22 -; CHECK-NEXT: vdup.32 q2, r0 -; CHECK-NEXT: vmov.f32 s3, s23 -; CHECK-NEXT: vstrw.32 q4, [r1, #96] -; CHECK-NEXT: vstrw.32 q0, [r1, #128] -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s10 -; CHECK-NEXT: vstrw.32 q1, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s15, s11 -; CHECK-NEXT: vstrw.32 q6, [r1, #64] -; CHECK-NEXT: vstrw.32 q0, [r1, #48] -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [r1, #144] -; CHECK-NEXT: vstrw.32 q7, [r1, #80] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: add sp, #160 +; CHECK-NEXT: .pad #152 +; CHECK-NEXT: sub sp, #152 +; CHECK-NEXT: vldrw.u32 q2, [r0, #176] +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vldrw.u32 q6, [r0, #128] +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #112] +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, 
d6, d7} @ 64-byte Reload +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vldrw.u32 q1, [r0, #96] +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #144] +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: add.w r0, r1, #64 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: add.w r0, r1, #192 +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: adds r1, #128 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: add sp, #152 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0 @@ -1936,84 +995,14 @@ define void @vst4_v8f16(<8 x half> *%src, <32 x half> *%dst) { ; CHECK-LABEL: vst4_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmovx.f16 s16, s11 -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov.16 q3[1], r3 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s3 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s7 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmovx.f16 s16, s10 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vstrw.32 q3, [r1, #48] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmovx.f16 s16, s9 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: 
vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s1 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s5 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmovx.f16 s8, s8 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 @@ -2034,159 +1023,26 @@ define void @vst4_v16f16(<16 x half> *%src, <64 x half> *%dst) { ; CHECK-LABEL: vst4_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: vmovx.f16 s0, s22 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.16 q6[0], r3 -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s14 -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q6[6], r2 -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vmov.16 q6[7], r2 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vstrw.32 q6, [r1, #96] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmovx.f16 s28, s23 -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmovx.f16 s28, s15 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmovx.f16 s28, s19 -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmovx.f16 s28, s20 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vstrw.32 q6, [r1, #112] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmovx.f16 s28, s12 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmovx.f16 s28, s16 -; 
CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmovx.f16 s20, s21 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vstrw.32 q6, [r1, #64] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmovx.f16 s12, s13 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s17 -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s16, s10 -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vstrw.32 q6, [r1, #80] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmovx.f16 s16, s11 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s7 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s3 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmovx.f16 s16, s8 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vstrw.32 q3, [r1, #48] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s4 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmovx.f16 s8, s9 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q6, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: add.w r0, r1, #64 +; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1] +; 
CHECK-NEXT: vst42.16 {q4, q5, q6, q7}, [r1]
+; CHECK-NEXT: vst43.16 {q4, q5, q6, q7}, [r1]
+; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
diff --git a/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
--- a/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
@@ -73,11 +73,12 @@
; CHECK-NEON-NEXT: ret void
;
; CHECK-MVE-LABEL: @load_factor4(
-; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
-; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-MVE-NEXT: [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-MVE-NEXT: [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP1]])
+; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-MVE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
; CHECK-MVE-NEXT: ret void
;
; CHECK-NONE-LABEL: @load_factor4(
@@ -98,10 +99,10 @@
define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
; CHECK-NEON-LABEL: @store_factor2(
-; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[PTR:%.*]] to i8*
-; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <16 x i8>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* [[TMP3]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 4)
; CHECK-NEON-NEXT: ret void
;
; CHECK-MVE-LABEL: @store_factor2(
@@ -123,11 +124,11 @@
; CHECK-NEON-LABEL: @store_factor3(
; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
-; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
; CHECK-NEON-NEXT: ret void
;
; CHECK-MVE-LABEL: @store_factor3(
@@ -155,19 +156,26 @@
; CHECK-NEON-LABEL: @store_factor4(
; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
-; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
; CHECK-NEON-NEXT: ret void
;
; CHECK-MVE-LABEL: @store_factor4(
; CHECK-MVE-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-MVE-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-MVE-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-MVE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 0)
+; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 1)
+; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 2)
+; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 3)
; CHECK-MVE-NEXT: ret void
;
; CHECK-NONE-LABEL: @store_factor4(
@@ -282,10 +290,10 @@
; CHECK-NEON-LABEL: @store_ptrvec_factor2(
; CHECK-NEON-NEXT: [[TMP1:%.*]] = ptrtoint <2 x i32*> [[V0:%.*]] to <2 x i32>
; CHECK-NEON-NEXT: [[TMP2:%.*]] = ptrtoint <2 x i32*> [[V1:%.*]] to <2 x i32>
-; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <4 x i32*>* [[PTR:%.*]] to i8*
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <4 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4)
; CHECK-NEON-NEXT: ret void
;
; CHECK-MVE-LABEL: @store_ptrvec_factor2(
@@ -309,11 +317,11 @@
; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEON-NEXT: [[TMP1:%.*]] = ptrtoint <4 x i32*> [[S0]] to <4 x i32>
; CHECK-NEON-NEXT: [[TMP2:%.*]] = ptrtoint <4 x i32*> [[S1]] to <4 x i32>
-; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <6 x i32*>* [[PTR:%.*]] to i8*
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
-; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], i32 4)
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = bitcast <6 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP6]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
; CHECK-NEON-NEXT: ret void
;
; CHECK-MVE-LABEL: @store_ptrvec_factor3(
@@ -343,12 +351,12 @@
; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> [[V3:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEON-NEXT: [[TMP1:%.*]] = ptrtoint <4 x i32*> [[S0]] to <4 x i32>
; CHECK-NEON-NEXT: [[TMP2:%.*]] = ptrtoint <4 x i32*> [[S1]] to <4 x i32>
-; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <8 x i32*>* [[PTR:%.*]] to i8*
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
-; CHECK-NEON-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 6, i32 7>
-; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i32 4)
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 6, i32 7>
+; CHECK-NEON-NEXT: [[TMP7:%.*]] = bitcast <8 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP7]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], i32 4)
; CHECK-NEON-NEXT: ret void
;
; CHECK-MVE-LABEL: @store_ptrvec_factor4(
@@ -381,9 +389,10 @@
; CHECK-NEON-NEXT: ret void
;
; CHECK-MVE-LABEL: @load_undef_mask_factor2(
-; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[PTR:%.*]], align 4
-; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
-; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
+; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i32*
+; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]])
+; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
; CHECK-MVE-NEXT: ret void
;
; CHECK-NONE-LABEL: @load_undef_mask_factor2(
@@ -439,11 +448,12 @@
; CHECK-NEON-NEXT: ret void
;
; CHECK-MVE-LABEL: @load_undef_mask_factor4(
-; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
-; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
-; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
-; CHECK-MVE-NEXT: [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
-; CHECK-MVE-NEXT: [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP1]])
+; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-MVE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
; CHECK-MVE-NEXT: ret void
;
; CHECK-NONE-LABEL: @load_undef_mask_factor4(
@@ -464,15 +474,18 @@
define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
; CHECK-NEON-LABEL: @store_undef_mask_factor2(
-; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
-; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT: 
[[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_undef_mask_factor2( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; CHECK-MVE-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP3:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP3]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP3]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 1) ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_undef_mask_factor2( @@ -489,11 +502,11 @@ ; CHECK-NEON-LABEL: @store_undef_mask_factor3( ; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_undef_mask_factor3( @@ -521,19 +534,26 @@ ; CHECK-NEON-LABEL: @store_undef_mask_factor4( ; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], 
<4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_undef_mask_factor4( ; CHECK-MVE-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; CHECK-MVE-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> -; CHECK-MVE-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 1) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 2) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 3) ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_undef_mask_factor4( @@ -582,10 +602,10 @@ define void @store_address_space(<4 x i32> addrspace(1)* %ptr, <2 x i32> %v0, <2 x i32> %v1) { ; CHECK-NEON-LABEL: @store_address_space( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* [[PTR:%.*]] to i8 addrspace(1)* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 0) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> addrspace(1)* [[PTR:%.*]] to i8 addrspace(1)* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* [[TMP3]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 0) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_address_space( @@ -693,12 +713,12 @@ define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor4( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = 
shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor4( @@ -718,12 +738,12 @@ define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor4_undefbeg( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor4_undefbeg( @@ -743,12 +763,12 @@ define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor4_undefend( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> 
[[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor4_undefend( @@ -768,12 +788,12 @@ define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor4_undefmid( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor4_undefmid( @@ -793,12 +813,12 @@ define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor4_undefmulti( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> ; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> 
[[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor4_undefmulti( @@ -818,11 +838,11 @@ define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor3( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor3( @@ -842,11 +862,11 @@ define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor3_undefmultimid( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor3_undefmultimid( @@ -887,11 +907,11 @@ define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor3_undeflane( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: 
[[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor3_undeflane( @@ -932,11 +952,11 @@ define void @store_general_mask_factor3_endstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor3_endstart_pass( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor3_endstart_pass( @@ -977,11 +997,11 @@ define void @store_general_mask_factor3_midstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { ; CHECK-NEON-LABEL: @store_general_mask_factor3_midstart_pass( -; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_general_mask_factor3_midstart_pass( @@ -1040,9 +1060,16 @@ ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_factor2_wide2( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4 -; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> -; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x 
i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]]) +; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1 +; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0 +; CHECK-MVE-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; CHECK-MVE-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP4]]) +; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; CHECK-MVE-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; CHECK-MVE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP5]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_factor2_wide2( @@ -1083,9 +1110,24 @@ ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_factor2_wide3( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4 -; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> -; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]]) +; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1 +; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0 +; CHECK-MVE-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; CHECK-MVE-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP4]]) +; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; CHECK-MVE-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; CHECK-MVE-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP4]], i32 8 +; CHECK-MVE-NEXT: [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP7]]) +; CHECK-MVE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1 +; CHECK-MVE-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0 +; CHECK-MVE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP5]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <8 x i32> +; CHECK-MVE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <12 x i32> +; CHECK-MVE-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> +; CHECK-MVE-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_factor2_wide3( @@ -1163,11 +1205,22 @@ ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_factor4_wide( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <32 x i32>, <32 x i32>* [[PTR:%.*]], align 4 -; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> -; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> -; CHECK-MVE-NEXT: [[V2:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, 
<8 x i32> -; CHECK-MVE-NEXT: [[V3:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP1]]) +; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3 +; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2 +; CHECK-MVE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1 +; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0 +; CHECK-MVE-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; CHECK-MVE-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP6]]) +; CHECK-MVE-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3 +; CHECK-MVE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2 +; CHECK-MVE-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; CHECK-MVE-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; CHECK-MVE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP7]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_factor4_wide( @@ -1189,20 +1242,28 @@ define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) { ; CHECK-NEON-LABEL: @store_factor2_wide( ; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP1]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) ; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 -; CHECK-NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8* -; CHECK-NEON-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4) +; CHECK-NEON-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP5]] to i8* +; 
CHECK-NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_factor2_wide( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> -; CHECK-MVE-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 1) +; CHECK-MVE-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; CHECK-MVE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 1) ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_factor2_wide( @@ -1220,17 +1281,17 @@ ; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> undef, <16 x i32> ; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4) +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP1]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP5]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) ; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12 -; CHECK-NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8* -; CHECK-NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4) +; CHECK-NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; 
CHECK-NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP6]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP10]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_factor3_wide( @@ -1259,26 +1320,42 @@ ; CHECK-NEON-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> ; CHECK-NEON-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> ; CHECK-NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32* -; CHECK-NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* -; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4) +; CHECK-NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP1]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4) ; CHECK-NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 -; CHECK-NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -; CHECK-NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4) +; CHECK-NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-NEON-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP7]] to i8* +; CHECK-NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP12]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 4) ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @store_factor4_wide( ; CHECK-MVE-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> ; CHECK-MVE-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> 
[[S0]], <16 x i32> [[S1]], <32 x i32> -; CHECK-MVE-NEXT: store <32 x i32> [[INTERLEAVED_VEC]], <32 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 1) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 2) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3) +; CHECK-MVE-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; CHECK-MVE-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 0) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 1) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 2) +; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3) ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_factor4_wide( @@ -1341,9 +1418,20 @@ ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_factor2_wide_pointer( -; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x i32*>, <16 x i32*>* [[PTR:%.*]], align 4 -; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> -; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> +; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x i32*>* [[PTR:%.*]] to i32* +; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]]) +; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1 +; CHECK-MVE-NEXT: [[TMP3:%.*]] = inttoptr <4 x i32> [[TMP2]] to <4 x i32*> +; CHECK-MVE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0 +; CHECK-MVE-NEXT: [[TMP5:%.*]] = inttoptr <4 x i32> [[TMP4]] to <4 x i32*> +; CHECK-MVE-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; CHECK-MVE-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP6]]) +; CHECK-MVE-NEXT: 
[[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; CHECK-MVE-NEXT: [[TMP8:%.*]] = inttoptr <4 x i32> [[TMP7]] to <4 x i32*> +; CHECK-MVE-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; CHECK-MVE-NEXT: [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*> +; CHECK-MVE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> [[TMP8]], <8 x i32> +; CHECK-MVE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32*> [[TMP5]], <4 x i32*> [[TMP10]], <8 x i32> ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_factor2_wide_pointer( diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll @@ -20,20 +20,20 @@ ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1 ; VF_4-LABEL: Checking a loop in "i8_factor_2" -; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp0, align 1 -; VF_4-NEXT: Found an estimated cost of 40 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1 +; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1 ; VF_8-LABEL: Checking a loop in "i8_factor_2" -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1 ; VF_16-LABEL: Checking a loop in "i8_factor_2" -; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 @@ -61,20 +61,20 @@ ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_4-LABEL: Checking a loop in "i16_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; 
VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_8-LABEL: Checking a loop in "i16_factor_2" -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_16-LABEL: Checking a loop in "i16_factor_2" -; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 @@ -97,25 +97,25 @@ br label %for.body ; VF_2-LABEL: Checking a loop in "i32_factor_2" -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_2: Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_4-LABEL: Checking a loop in "i32_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_8-LABEL: Checking a loop in "i32_factor_2" -; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_8: Found an estimated cost of 8 for VF 8 For 
instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_16-LABEL: Checking a loop in "i32_factor_2"
-; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
@@ -189,15 +189,15 @@
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_4-NEXT: Found an estimated cost of 40 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_8-LABEL: Checking a loop in "f16_factor_2"
-; VF_8: Found an estimated cost of 272 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_16-LABEL: Checking a loop in "f16_factor_2"
-; VF_16: Found an estimated cost of 1056 for VF 16 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 544 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f16.2, %f16.2* %data, i64 %i, i32 0
@@ -220,25 +220,25 @@
   br label %for.body
 ; VF_2-LABEL: Checking a loop in "f32_factor_2"
-; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_2: Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load float, float* %tmp0, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load float, float* %tmp1, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_4-LABEL: Checking a loop in "f32_factor_2"
-; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load float, float* %tmp0, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load float, float* %tmp1, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_8-LABEL: Checking a loop in "f32_factor_2"
-; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load float, float* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load float, float* %tmp1, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_16-LABEL: Checking a loop in "f32_factor_2"
-; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load float, float* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load float, float* %tmp1, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f32.2, %f32.2* %data, i64 %i, i32 0
@@ -700,14 +700,14 @@
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp2, align 1
 ; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i8 0, i8* %tmp3, align 1
 ; VF_16-LABEL: Checking a loop in "i8_factor_4"
-; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
+; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, i8* %tmp3, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1
-; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1
+; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.4, %i8.4* %data, i64 %i, i32 0
@@ -754,23 +754,23 @@
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp2, align 2
 ; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i16 0, i16* %tmp3, align 2
 ; VF_8-LABEL: Checking a loop in "i16_factor_4"
-; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2
+; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2
 ; VF_16-LABEL: Checking a loop in "i16_factor_4"
-; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2
+; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.4, %i16.4* %data, i64 %i, i32 0
@@ -808,32 +808,32 @@
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp2, align 4
 ; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i32 0, i32* %tmp3, align 4
 ; VF_4-LABEL: Checking a loop in "i32_factor_4"
-; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4
 ; VF_8-LABEL: Checking a loop in "i32_factor_4"
-; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_8: Found an estimated cost of 16 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4
 ; VF_16-LABEL: Checking a loop in "i32_factor_4"
-; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_16: Found an estimated cost of 32 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.4, %i32.4* %data, i64 %i, i32 0
@@ -943,23 +943,23 @@
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2
 ; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store half 0xH0000, half* %tmp3, align 2
 ; VF_8-LABEL: Checking a loop in "f16_factor_4"
-; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp1, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load half, half* %tmp2, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load half, half* %tmp3, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2
 ; VF_16-LABEL: Checking a loop in "f16_factor_4"
-; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp1, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load half, half* %tmp2, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load half, half* %tmp3, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 0
@@ -997,32 +997,32 @@
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4
 ; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store float 0.000000e+00, float* %tmp3, align 4
 ; VF_4-LABEL: Checking a loop in "f32_factor_4"
-; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp1, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, float* %tmp2, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, float* %tmp3, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4
 ; VF_8-LABEL: Checking a loop in "f32_factor_4"
-; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_8: Found an estimated cost of 16 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp1, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load float, float* %tmp2, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load float, float* %tmp3, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4
 ; VF_16-LABEL: Checking a loop in "f32_factor_4"
-; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_16: Found an estimated cost of 32 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp1, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load float, float* %tmp2, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load float, float* %tmp3, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 0