diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -20887,6 +20887,32 @@ continue; } + // Last chance - see if the vector is another shuffle and if it + // uses one of the existing candidate shuffle ops. + if (auto *CurrentSVN = dyn_cast<ShuffleVectorSDNode>(CurrentVec)) { + int InnerIdx = CurrentSVN->getMaskElt(Idx); + if (InnerIdx < 0) { + Mask.push_back(-1); + continue; + } + SDValue InnerVec = (InnerIdx < (int)NumElts) + ? CurrentSVN->getOperand(0) + : CurrentSVN->getOperand(1); + if (InnerVec.isUndef()) { + Mask.push_back(-1); + continue; + } + InnerIdx %= NumElts; + if (InnerVec == SV0) { + Mask.push_back(InnerIdx); + continue; + } + if (InnerVec == SV1) { + Mask.push_back(InnerIdx + NumElts); + continue; + } + } + // Bail out if we cannot convert the shuffle pair into a single shuffle. return false; } diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll @@ -10,37 +10,26 @@ ; CHECK-NEXT: blt .LBB0_2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vmullt.s32 q0, q2, q1 -; CHECK-NEXT: vmullb.s32 q3, q2, q1 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vmov r7, s1 +; CHECK-NEXT: vmullb.s32 q2, q1, q0 +; CHECK-NEXT: vmullt.s32 q3, q1, q0 +; CHECK-NEXT: vmov r5, s11 +; CHECK-NEXT: vmov r12, s10 ; CHECK-NEXT: lsrl r12, r5, #31 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: lsrl r4, r7, #31 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r12 -; CHECK-NEXT: vmov r12, s14 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r5 +; CHECK-NEXT: vmov r4, s8 +; 
CHECK-NEXT: vmov r5, s9 +; CHECK-NEXT: lsrl r4, r5, #31 +; CHECK-NEXT: vmov q2[2], q2[0], r4, r12 ; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: vmov r7, s13 +; CHECK-NEXT: vmov r12, s14 ; CHECK-NEXT: lsrl r12, r5, #31 ; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: lsrl r4, r7, #31 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r12 -; CHECK-NEXT: vmov q1[3], q1[1], r7, r5 -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s7, s10 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: lsrl r4, r5, #31 +; CHECK-NEXT: vmov q2[3], q2[1], r4, r12 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: bne .LBB0_1 ; CHECK-NEXT: .LBB0_2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -6,19 +6,10 @@ define void @vst2_v2i32(<2 x i32> *%src, <4 x i32> *%dst) { ; CHECK-LABEL: vst2_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r12, r3, [r0] -; CHECK-NEXT: ldrd r2, r0, [r0, #8] -; CHECK-NEXT: vmov q0[2], q0[0], r12, r3 -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: ldm.w r0, {r2, r3, r12} +; CHECK-NEXT: ldr r0, [r0, #12] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r12, r0 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -333,11 +324,9 @@ ; CHECK-LABEL: vst2_v2f32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldr s0, [r0] -; CHECK-NEXT: vldr 
s4, [r0, #4] +; CHECK-NEXT: vldr s2, [r0, #4] ; CHECK-NEXT: vldr s1, [r0, #8] -; CHECK-NEXT: vldr s5, [r0, #12] -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vldr s3, [r0, #12] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: