diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13873,6 +13873,33 @@
       FromVT.getVectorNumElements() % NumElements != 0)
     return SDValue();
 
+  // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
+  // use the VMOVN over splitting the store. We are looking for patterns of:
+  // !rev: 0 N 1 N+1 2 N+2 ...
+  //  rev: N 0 N+1 1 N+2 2 ...
+  auto isVMOVNOriginalMask = [&](ArrayRef<int> M, bool rev) {
+    unsigned NumElts = ToVT.getVectorNumElements();
+    if (NumElts != M.size() || (ToVT != MVT::v8i16 && ToVT != MVT::v16i8))
+      return false;
+
+    unsigned Off0 = rev ? NumElts : 0;
+    unsigned Off1 = rev ? 0 : NumElts;
+
+    for (unsigned i = 0; i < NumElts; i += 2) {
+      if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
+        return false;
+      if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
+        return false;
+    }
+
+    return true;
+  };
+
+  if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc->getOperand(0)))
+    if (isVMOVNOriginalMask(Shuffle->getMask(), false) ||
+        isVMOVNOriginalMask(Shuffle->getMask(), true))
+      return SDValue();
+
   SDLoc DL(St);
   // Details about the old store
   SDValue Ch = St->getChain();
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
@@ -4,16 +4,8 @@
 define arm_aapcs_vfpcc void @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src2, <8 x i16> *%dest) {
 ; CHECK-LABEL: vmovn32_trunc1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s8, s2
-; CHECK-NEXT:    vmov.f32 s9, s6
-; CHECK-NEXT:    vmov.f32 s10, s3
-; CHECK-NEXT:    vmov.f32 s11, s7
-; CHECK-NEXT:    vstrh.32 q2, [r0, #8]
-; CHECK-NEXT:    vmov.f32 s8, s0
-; CHECK-NEXT:    vmov.f32 s9, s4
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov.f32 s11, s5
-; CHECK-NEXT:    vstrh.32 q2, [r0]
+; CHECK-NEXT:    vmovnt.i32 q0, q1
+; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -25,16 +17,8 @@
 define arm_aapcs_vfpcc void @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src2, <8 x i16> *%dest) {
 ; CHECK-LABEL: vmovn32_trunc2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s8, s6
-; CHECK-NEXT:    vmov.f32 s9, s2
-; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    vstrh.32 q2, [r0, #8]
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.f32 s9, s0
-; CHECK-NEXT:    vmov.f32 s10, s5
-; CHECK-NEXT:    vmov.f32 s11, s1
-; CHECK-NEXT:    vstrh.32 q2, [r0]
+; CHECK-NEXT:    vmovnt.i32 q1, q0
+; CHECK-NEXT:    vstrw.32 q1, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
@@ -46,40 +30,8 @@
 define arm_aapcs_vfpcc void @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src2, <16 x i8> *%dest) {
 ; CHECK-LABEL: vmovn16_trunc1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
-; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vstrb.16 q2, [r0, #8]
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vstrb.16 q2, [r0]
+; CHECK-NEXT:    vmovnt.i16 q0, q1
+; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -91,40 +43,8 @@
 define arm_aapcs_vfpcc void @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src2, <16 x i8> *%dest) {
 ; CHECK-LABEL: vmovn16_trunc2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vstrb.16 q2, [r0, #8]
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vstrb.16 q2, [r0]
+; CHECK-NEXT:    vmovnt.i16 q1, q0
+; CHECK-NEXT:    vstrw.32 q1, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
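
For illustration only (not part of the patch): a minimal standalone sketch of the interleave-mask test that isVMOVNOriginalMask performs, so it can be exercised outside of LLVM. The helper name isVMOVNOriginalMaskSketch, the use of std::vector in place of ArrayRef/MVT, and the added even-size guard are assumptions made to keep the example self-contained. Note the mask values: by the time this combine sees the shuffle, SelectionDAG has widened the two <4 x i32> sources to the result width, so the second source's lanes start at index NumElts (8), not at 4 as in the IR masks in the tests above.

#include <cassert>
#include <cstdio>
#include <vector>

// Mirrors the patch's isVMOVNOriginalMask lambda: accept masks that
// interleave the bottom lanes of two sources, M[i] == Off0 + i/2 and
// M[i+1] == Off1 + i/2, where negative (undef) lanes match anything.
//   !rev: 0 N 1 N+1 2 N+2 ...
//    rev: N 0 N+1 1 N+2 2 ...
static bool isVMOVNOriginalMaskSketch(const std::vector<int> &M, bool rev) {
  // Stands in for ToVT.getVectorNumElements(); the patch only reaches this
  // point for v8i16 and v16i8, so NumElts is always even there. The guard
  // is an assumption added for this standalone setting.
  unsigned NumElts = M.size();
  if (NumElts == 0 || NumElts % 2 != 0)
    return false;

  unsigned Off0 = rev ? NumElts : 0;
  unsigned Off1 = rev ? 0 : NumElts;

  for (unsigned i = 0; i < NumElts; i += 2) {
    if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
      return false;
    if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
      return false;
  }
  return true;
}

int main() {
  // DAG-level mask for vmovn32_trunc1 (IR mask <0,4,1,5,2,6,3,7> after the
  // second source is widened, so its lanes are renumbered 8..11).
  assert(isVMOVNOriginalMaskSketch({0, 8, 1, 9, 2, 10, 3, 11}, false));
  // DAG-level mask for vmovn32_trunc2, the "rev" form.
  assert(isVMOVNOriginalMaskSketch({8, 0, 9, 1, 10, 2, 11, 3}, true));
  // Undef lanes (-1) act as wildcards, as in the patch.
  assert(isVMOVNOriginalMaskSketch({0, -1, 1, 9, -1, 10, 3, 11}, false));
  // A plain concatenation mask is not a VMOVN interleave.
  assert(!isVMOVNOriginalMaskSketch({0, 1, 2, 3, 8, 9, 10, 11}, false));
  puts("all mask checks passed");
}

Treating negative (undef) lanes as wildcards mirrors the patch and lets partially-undef shuffles still take the VMOVN path instead of being split into two narrowing stores.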