Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -660,6 +660,7 @@
     /// function checks the vector element type and the overall width of the
     /// vector.
     bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy,
+                                      Align Alignment,
                                       const DataLayout &DL) const;
 
     bool alignLoopsWithOptSize() const override;
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -18868,7 +18868,8 @@
 }
 
 bool ARMTargetLowering::isLegalInterleavedAccessType(
-    unsigned Factor, FixedVectorType *VecTy, const DataLayout &DL) const {
+    unsigned Factor, FixedVectorType *VecTy, Align Alignment,
+    const DataLayout &DL) const {
 
   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
@@ -18891,6 +18892,9 @@
   // Ensure the element type is legal.
   if (ElSize != 8 && ElSize != 16 && ElSize != 32)
     return false;
+  // And that the alignment is high enough under MVE.
+  if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
+    return false;
 
   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
   // 128 will be split into multiple interleaved accesses.
@@ -18931,11 +18935,12 @@
 
   Type *EltTy = VecTy->getElementType();
   const DataLayout &DL = LI->getModule()->getDataLayout();
+  Align Alignment = LI->getAlign();
 
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
-  if (!isLegalInterleavedAccessType(Factor, VecTy, DL))
+  if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
     return false;
 
   unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
@@ -19084,11 +19089,12 @@
   auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
 
   const DataLayout &DL = SI->getModule()->getDataLayout();
+  Align Alignment = SI->getAlign();
 
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
-  if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL))
+  if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
     return false;
 
   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1370,7 +1370,7 @@
     // matched to more than one vldN/vstN instruction.
     int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
     if (NumElts % Factor == 0 &&
-        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
+        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
       return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
 
     // Some smaller than legal interleaved patterns are cheap as we can make
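For context, lowerInterleavedLoad/lowerInterleavedStore above are invoked by the IR-level InterleavedAccess pass when it matches a wide load or store together with de-interleaving shufflevector masks (the TTI hook answers the analogous cost query for the vectorizers). A minimal, illustrative IR sketch of the load-side pattern follows; the function name and types are chosen for illustration and are not taken from the tests below.

; Factor-2 de-interleave of an under-aligned load. With the alignment check
; added above, an MVE target no longer turns this into vld20/vld21, because
; the access alignment (1) is below the element size in bytes (4); the same
; IR with "align 4" would still be lowered to the interleaved load.
define <4 x i32> @deinterleave_even_align1(<8 x i32>* %src) {
entry:
  %wide = load <8 x i32>, <8 x i32>* %src, align 1
  %even = shufflevector <8 x i32> %wide, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i32> %even
}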
Index: llvm/test/CodeGen/Thumb2/mve-vld2.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -101,9 +101,16 @@
 define void @vld2_v4i32_align1(<8 x i32> *%src, <4 x i32> *%dst) {
 ; CHECK-LABEL: vld2_v4i32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
-; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s8, s5
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s7, s2
+; CHECK-NEXT:    vadd.i32 q0, q1, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -198,9 +205,41 @@
 define void @vld2_v8i16_align1(<16 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld2_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
-; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]
-; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vadd.i16 q0, q3, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -489,9 +528,16 @@
 define void @vld2_v4f32_align1(<8 x float> *%src, <4 x float> *%dst) {
 ; CHECK-LABEL: vld2_v4f32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
-; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s8, s5
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s7, s2
+; CHECK-NEXT:    vadd.f32 q0, q1, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -614,10 +660,53 @@
 define void @vld2_v8f16_align1(<16 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld2_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
-; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    vldrb.u8 q2, [r0]
+; CHECK-NEXT:    vldrb.u8 q1, [r0, #16]
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmovx.f16 s12, s8
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    vmov.16 q0[1], r3
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmovx.f16 s16, s10
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s8, s11
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s5
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s6
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s7
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vadd.f16 q0, q0, q3
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <16 x half>, <16 x half>* %src, align 1
Index: llvm/test/CodeGen/Thumb2/mve-vld4.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -191,18 +191,33 @@
 define void @vld4_v4i32_align1(<16 x i32> *%src, <4 x i32> *%dst) {
 ; CHECK-LABEL: vld4_v4i32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i32 q4, q2, q3
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q3, [r0, #32]
+; CHECK-NEXT:    vldrb.u8 q1, [r0, #48]
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vmov.f64 d10, d1
+; CHECK-NEXT:    vmov.f32 s19, s7
+; CHECK-NEXT:    vmov.f32 s21, s10
+; CHECK-NEXT:    vmov.f32 s16, s3
+; CHECK-NEXT:    vmov.f32 s15, s6
+; CHECK-NEXT:    vmov.f32 s22, s14
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s23, s6
+; CHECK-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vmov.f32 s23, s5
+; CHECK-NEXT:    vmov.f32 s20, s1
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vmov.f32 s3, s4
+; CHECK-NEXT:    vmov.f32 s21, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vadd.i32 q0, q0, q5
 ; CHECK-NEXT:    vadd.i32 q0, q0, q4
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <16 x i32>, <16 x i32>* %src, align 1
@@ -373,18 +388,89 @@
 define void @vld4_v8i16_align1(<32 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld4_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i16 q4, q2, q3
-; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #32]
+; CHECK-NEXT:    vldrb.u8 q3, [r0, #48]
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q4[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.16 q4[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.f32 s18, s22
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vadd.i16 q4, q5, q4
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov.f32 s26, s6
+; CHECK-NEXT:    vmov.f32 s27, s7
+; CHECK-NEXT:    vadd.i16 q0, q6, q5
 ; CHECK-NEXT:    vadd.i16 q0, q0, q4
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x i16>, <32 x i16>* %src, align 1
@@ -965,18 +1051,33 @@
 define void @vld4_v4f32_align1(<16 x float> *%src, <4 x float> *%dst) {
 ; CHECK-LABEL: vld4_v4f32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.f32 q4, q2, q3
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q3, [r0, #32]
+; CHECK-NEXT:    vldrb.u8 q1, [r0, #48]
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vmov.f64 d10, d1
+; CHECK-NEXT:    vmov.f32 s19, s7
+; CHECK-NEXT:    vmov.f32 s21, s10
+; CHECK-NEXT:    vmov.f32 s16, s3
+; CHECK-NEXT:    vmov.f32 s15, s6
+; CHECK-NEXT:    vmov.f32 s22, s14
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s23, s6
+; CHECK-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vmov.f32 s23, s5
+; CHECK-NEXT:    vmov.f32 s20, s1
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vmov.f32 s3, s4
+; CHECK-NEXT:    vmov.f32 s21, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vadd.f32 q0, q0, q5
 ; CHECK-NEXT:    vadd.f32 q0, q0, q4
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <16 x float>, <16 x float>* %src, align 1
@@ -1184,18 +1285,117 @@
 define void @vld4_v8f16_align1(<32 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld4_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.f16 q4, q2, q3
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
-; CHECK-NEXT:    vadd.f16 q0, q0, q4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #64
+; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    vldrb.u8 q6, [r0]
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
+; CHECK-NEXT:    vldrb.u8 q4, [r0, #32]
+; CHECK-NEXT:    vldrb.u8 q5, [r0, #48]
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov r3, s26
+; CHECK-NEXT:    vmov.16 q0[1], r3
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmovx.f16 s0, s19
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s0, s17
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmovx.f16 s0, s21
+; CHECK-NEXT:    vmov.16 q1[4], r3
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q1[5], r2
+; CHECK-NEXT:    vmovx.f16 s0, s23
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s0, s25
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s0, s27
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmovx.f16 s0, s9
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s0, s11
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov r2, s19
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov r0, s21
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov r0, s23
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov r0, s25
+; CHECK-NEXT:    vmov.16 q7[0], r0
+; CHECK-NEXT:    vmov r2, s27
+; CHECK-NEXT:    vmov.16 q7[1], r2
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q7[2], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmovx.f16 s0, s16
+; CHECK-NEXT:    vmov.16 q7[3], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s0, s18
+; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmovx.f16 s8, s20
+; CHECK-NEXT:    vmov.16 q1[5], r2
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s22
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s24
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s26
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.16 q4[5], r2
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.f32 s26, s6
+; CHECK-NEXT:    vmov.f32 s30, s2
+; CHECK-NEXT:    vmov.f32 s31, s3
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov.f32 s14, s22
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vmov.f32 s15, s23
+; CHECK-NEXT:    vmov.f32 s27, s7
+; CHECK-NEXT:    vadd.f16 q3, q7, q3
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vadd.f16 q0, q0, q6
+; CHECK-NEXT:    vadd.f16 q0, q0, q3
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x half>, <32 x half>* %src, align 1
Index: llvm/test/CodeGen/Thumb2/mve-vst2.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -100,10 +100,18 @@
 define void @vst2_v4i32_align1(<4 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK-LABEL: vst2_v4i32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
-; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d4, d3
+; CHECK-NEXT:    vmov.f64 d6, d2
+; CHECK-NEXT:    vmov.f32 s9, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s15, s1
+; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
+; CHECK-NEXT:    vstrb.8 q3, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
@@ -199,10 +207,42 @@
 define void @vst2_v8i16_align1(<8 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-LABEL: vst2_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
-; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
+; CHECK-NEXT:    vstrb.8 q3, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
@@ -453,10 +493,18 @@
 define void @vst2_v4f32_align1(<4 x float> *%src, <8 x float> *%dst) {
 ; CHECK-LABEL: vst2_v4f32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
-; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d4, d3
+; CHECK-NEXT:    vmov.f64 d6, d2
+; CHECK-NEXT:    vmov.f32 s9, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s15, s1
+; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
+; CHECK-NEXT:    vstrb.8 q3, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
@@ -585,10 +633,50 @@
 define void @vst2_v8f16_align1(<8 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vst2_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
-; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmovx.f16 s12, s6
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s2
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s3
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s4
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s0
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s0, s1
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vstrb.8 q2, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
Index: llvm/test/CodeGen/Thumb2/mve-vst4.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -203,14 +203,39 @@
 define void @vst4_v4i32_align1(<4 x i32> *%src, <16 x i32> *%dst) {
 ; CHECK-LABEL: vst4_v4i32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov q3, q2
-; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.f32 s1, s5
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.f32 s9, s4
+; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.f32 s16, s8
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vmov.f32 s20, s11
+; CHECK-NEXT:    vmov.f32 s8, s10
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.f32 s21, s7
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.f32 s9, s6
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vstrb.8 q4, [r1]
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    vstrb.8 q5, [r1, #48]
+; CHECK-NEXT:    vstrb.8 q2, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
@@ -395,14 +420,80 @@
 define void @vst4_v8i16_align1(<8 x i16> *%src, <32 x i16> *%dst) {
 ; CHECK-LABEL: vst4_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov q3, q2
-; CHECK-NEXT:    vst40.16 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst41.16 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst42.16 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst43.16 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.f32 s1, s17
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.f32 s17, s21
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vstrb.8 q4, [r1]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.f32 s21, s25
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vstrb.8 q5, [r1, #48]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov.f32 s25, s9
+; CHECK-NEXT:    vmov.f32 s27, s11
+; CHECK-NEXT:    vstrb.8 q6, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
@@ -917,14 +1008,32 @@
 define void @vst4_v4f32_align1(<4 x float> *%src, <16 x float> *%dst) {
 ; CHECK-LABEL: vst4_v4f32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov q3, q2
-; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s4, s13
+; CHECK-NEXT:    vmov.f32 s13, s8
+; CHECK-NEXT:    vmov.f32 s20, s15
+; CHECK-NEXT:    vmov.f32 s5, s9
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s22, s3
+; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vmov.f64 d8, d6
+; CHECK-NEXT:    vstrb.8 q1, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s17, s8
+; CHECK-NEXT:    vmov.f32 s18, s0
+; CHECK-NEXT:    vmov.f32 s19, s0
+; CHECK-NEXT:    vmov.f32 s23, s3
+; CHECK-NEXT:    vstrb.8 q4, [r1]
+; CHECK-NEXT:    vmov.f32 s0, s14
+; CHECK-NEXT:    vstrb.8 q5, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s1, s10
+; CHECK-NEXT:    vmov.f32 s3, s2
+; CHECK-NEXT:    vstrb.8 q0, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
@@ -1129,14 +1238,101 @@
 define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) {
 ; CHECK-LABEL: vst4_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov q3, q2
-; CHECK-NEXT:    vst40.16 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst41.16 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst42.16 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT:    vst43.16 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    vldrw.u32 q7, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
+; CHECK-NEXT:    vmov r3, s30
+; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.16 q1[0], r3
+; CHECK-NEXT:    vmovx.f16 s0, s30
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s0, s21
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov r0, s21
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov r2, s29
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov q1, q2
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmovx.f16 s0, s29
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s0, s5
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s0, s20
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmovx.f16 s0, s28
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov r0, s23
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmovx.f16 s0, s23
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov r2, s31
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmovx.f16 s28, s31
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmovx.f16 s28, s7
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmovx.f16 s20, s22
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov.16 q7[2], r0
+; CHECK-NEXT:    vmov.f32 s25, s17
+; CHECK-NEXT:    vmov.16 q7[3], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q7[6], r0
+; CHECK-NEXT:    vmovx.f16 s20, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q7[7], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.f32 s1, s9
+; CHECK-NEXT:    vmov.f32 s13, s21
+; CHECK-NEXT:    vmov.f32 s5, s29
+; CHECK-NEXT:    vmov.f32 s15, s23
+; CHECK-NEXT:    vmov.f32 s27, s19
+; CHECK-NEXT:    vstrb.8 q3, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vstrb.8 q6, [r1]
+; CHECK-NEXT:    vmov.f32 s7, s31
+; CHECK-NEXT:    vstrb.8 q0, [r1, #48]
+; CHECK-NEXT:    vstrb.8 q1, [r1, #32]
+; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0