diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -14695,18 +14695,22 @@ // use the VMOVN over splitting the store. We are looking for patterns of: // !rev: 0 N 1 N+1 2 N+2 ... // rev: N 0 N+1 1 N+2 2 ... - auto isVMOVNOriginalMask = [&](ArrayRef<int> M, bool rev) { + // The shuffle may either be a single source (in which case N = NumElts/2) or + // two inputs extended with concat to the same size (in which case N = + // NumElts). + auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) { + ArrayRef<int> M = SVN->getMask(); unsigned NumElts = ToVT.getVectorNumElements(); - if (NumElts != M.size()) - return false; + if (SVN->getOperand(1).isUndef()) + NumElts /= 2; - unsigned Off0 = rev ? NumElts : 0; - unsigned Off1 = rev ? 0 : NumElts; + unsigned Off0 = Rev ? NumElts : 0; + unsigned Off1 = Rev ? 0 : NumElts; - for (unsigned i = 0; i < NumElts; i += 2) { - if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2)) + for (unsigned I = 0; I < NumElts; I += 2) { + if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2)) return false; - if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2)) + if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2)) return false; } @@ -14721,9 +14725,8 @@ return SDValue(); } } - if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc->getOperand(0))) - if (isVMOVNOriginalMask(Shuffle->getMask(), false) || - isVMOVNOriginalMask(Shuffle->getMask(), true)) + if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0))) + if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true)) return SDValue(); LLVMContext &C = *DAG.getContext(); diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll @@ -30,16 +30,8 @@ define arm_aapcs_vfpcc void @vmovn32_trunc1_onesrc(<8 x i32> %src1, <8 x i16> *%dest) { ; CHECK-LABEL: 
vmovn32_trunc1_onesrc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vstrh.32 q2, [r0, #8] -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s11, s5 -; CHECK-NEXT: vstrh.32 q2, [r0] +; CHECK-NEXT: vmovnt.i32 q0, q1 +; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <8 x i32> %src1, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> @@ -51,16 +43,8 @@ define arm_aapcs_vfpcc void @vmovn32_trunc2_onesrc(<8 x i32> %src1, <8 x i16> *%dest) { ; CHECK-LABEL: vmovn32_trunc2_onesrc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vstrh.32 q2, [r0, #8] -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vmov.f32 s10, s5 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vstrh.32 q2, [r0] +; CHECK-NEXT: vmovnt.i32 q1, q0 +; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <8 x i32> %src1, <8 x i32> undef, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3> @@ -98,40 +82,8 @@ define arm_aapcs_vfpcc void @vmovn16_trunc1_onesrc(<16 x i16> %src1, <16 x i8> *%dest) { ; CHECK-LABEL: vmovn16_trunc1_onesrc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u16 r1, q1[6] -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u16 r1, q1[7] -; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vstrb.16 q2, [r0, #8] -; CHECK-NEXT: 
vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vstrb.16 q2, [r0] +; CHECK-NEXT: vmovnt.i16 q0, q1 +; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <16 x i16> %src1, <16 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> @@ -143,40 +95,8 @@ define arm_aapcs_vfpcc void @vmovn16_trunc2_onesrc(<16 x i16> %src1, <16 x i8> *%dest) { ; CHECK-LABEL: vmovn16_trunc2_onesrc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q1[6] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u16 r1, q1[7] -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vstrb.16 q2, [r0, #8] -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vstrb.16 q2, 
[r0] +; CHECK-NEXT: vmovnt.i16 q1, q0 +; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <16 x i16> %src1, <16 x i16> undef, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>