diff --git a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp --- a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp +++ b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp @@ -193,6 +193,36 @@ Exts.insert(I); break; + case Instruction::Call: { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + if (!II) + return false; + + switch (II->getIntrinsicID()) { + case Intrinsic::abs: + case Intrinsic::smin: + case Intrinsic::smax: + case Intrinsic::umin: + case Intrinsic::umax: + case Intrinsic::sadd_sat: + case Intrinsic::ssub_sat: + case Intrinsic::uadd_sat: + case Intrinsic::usub_sat: + case Intrinsic::minnum: + case Intrinsic::maxnum: + case Intrinsic::fabs: + case Intrinsic::fma: + case Intrinsic::ceil: + case Intrinsic::floor: + case Intrinsic::rint: + case Intrinsic::round: + case Intrinsic::trunc: + break; + default: + return false; + } + LLVM_FALLTHROUGH; // Fall through to treating these like an operator below. + } // Binary/tertiary ops case Instruction::Add: case Instruction::Sub: @@ -210,6 +240,8 @@ Ops.insert(I); for (Use &Op : I->operands()) { + if (!isa<FixedVectorType>(Op->getType())) + continue; if (isa<Instruction>(Op)) Worklist.push_back(cast<Instruction>(&Op)); else @@ -244,7 +276,7 @@ dbgs() << " " << *I << "\n"; dbgs() << " OtherLeafs:"; for (auto *I : OtherLeafs) - dbgs() << " " << *I << "\n"; + dbgs() << " " << *I->get() << " of " << *I->getUser() << "\n"; dbgs() << "Truncs:"; for (auto *I : Truncs) dbgs() << " " << *I << "\n"; diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -631,69 +631,29 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.u16 r1, q1[1] -; 
CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.u16 r1, q2[0] -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.u16 r1, q2[1] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r1, q2[4] -; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vqadd.s32 q4, q3, q0 -; CHECK-NEXT: vqadd.u32 q4, q4, q0 +; CHECK-NEXT: vmovlb.u16 q2, q1 +; CHECK-NEXT: vmovlb.s16 q3, q0 +; CHECK-NEXT: vqadd.s32 q4, q3, q2 +; CHECK-NEXT: vmovlt.u16 q1, q1 +; CHECK-NEXT: vqadd.u32 q4, q4, q2 +; CHECK-NEXT: vmovlt.s16 q0, q0 ; CHECK-NEXT: vqsub.s32 q4, q4, q3 -; CHECK-NEXT: vqsub.u32 q4, q4, q0 +; CHECK-NEXT: vqsub.u32 q4, q4, q2 ; CHECK-NEXT: vabs.s32 q4, q4 ; CHECK-NEXT: vmin.s32 q4, q4, q3 -; CHECK-NEXT: vmax.s32 q4, q4, q0 +; CHECK-NEXT: vmax.s32 q4, q4, q2 ; CHECK-NEXT: vmin.u32 q3, q4, q3 -; CHECK-NEXT: vmax.u32 q3, q3, q0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.u16 r1, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmovlb.u16 q1, q2 -; CHECK-NEXT: vmovlb.s16 q2, q3 -; CHECK-NEXT: vqadd.s32 q3, q2, q1 -; CHECK-NEXT: vqadd.u32 q3, q3, q1 -; CHECK-NEXT: vqsub.s32 q3, q3, q2 -; CHECK-NEXT: vqsub.u32 q3, q3, q1 -; CHECK-NEXT: vabs.s32 q3, q3 -; CHECK-NEXT: vmin.s32 q3, q3, q2 -; CHECK-NEXT: vmax.s32 q3, q3, q1 -; CHECK-NEXT: vmin.u32 q2, q3, q2 -; CHECK-NEXT: vmax.u32 q1, q2, q1 -; CHECK-NEXT: vmov r0, s4 -; 
CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vqadd.s32 q4, q0, q1 +; CHECK-NEXT: vqadd.u32 q4, q4, q1 +; CHECK-NEXT: vqsub.s32 q4, q4, q0 +; CHECK-NEXT: vqsub.u32 q4, q4, q1 +; CHECK-NEXT: vabs.s32 q4, q4 +; CHECK-NEXT: vmin.s32 q4, q4, q0 +; CHECK-NEXT: vmax.s32 q4, q4, q1 +; CHECK-NEXT: vmin.u32 q0, q4, q0 +; CHECK-NEXT: vmax.u32 q1, q0, q1 +; CHECK-NEXT: vmax.u32 q0, q3, q2 +; CHECK-NEXT: vmovnt.i32 q0, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -717,49 +677,30 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vcvtt.f32.f16 s3, s5 -; CHECK-NEXT: vcvtt.f32.f16 s15, s9 -; CHECK-NEXT: vcvtb.f32.f16 s2, s5 -; CHECK-NEXT: vcvtb.f32.f16 s14, s9 -; CHECK-NEXT: vcvtt.f32.f16 s1, s4 -; CHECK-NEXT: vcvtt.f32.f16 s13, s8 -; CHECK-NEXT: vcvtb.f32.f16 s0, s4 -; CHECK-NEXT: vcvtb.f32.f16 s12, s8 -; CHECK-NEXT: vabs.f32 q4, q3 -; CHECK-NEXT: vminnm.f32 q4, q4, q3 -; CHECK-NEXT: vmaxnm.f32 q4, q4, q0 -; CHECK-NEXT: vfma.f32 q0, q4, q3 -; CHECK-NEXT: vcvtt.f32.f16 s19, s11 -; CHECK-NEXT: vrintp.f32 q0, q0 -; CHECK-NEXT: vcvtb.f32.f16 s18, s11 +; CHECK-NEXT: vcvtb.f32.f16 q2, q0 +; CHECK-NEXT: vcvtb.f32.f16 q4, q1 +; CHECK-NEXT: vabs.f32 q3, q2 +; CHECK-NEXT: vcvtt.f32.f16 q0, q0 +; CHECK-NEXT: vminnm.f32 q3, q3, q2 +; CHECK-NEXT: vcvtt.f32.f16 q1, q1 +; CHECK-NEXT: vmaxnm.f32 q3, q3, q4 +; CHECK-NEXT: vfma.f32 q4, q3, q2 +; CHECK-NEXT: vabs.f32 q3, q0 +; CHECK-NEXT: vminnm.f32 q3, q3, q0 +; CHECK-NEXT: vrintp.f32 q2, q4 +; CHECK-NEXT: vmaxnm.f32 q3, q3, q1 +; CHECK-NEXT: vrintm.f32 q2, q2 +; CHECK-NEXT: vfma.f32 q1, q3, q0 +; CHECK-NEXT: vrintx.f32 q2, q2 +; CHECK-NEXT: vrintp.f32 q0, q1 +; CHECK-NEXT: vrinta.f32 q2, q2 ; CHECK-NEXT: vrintm.f32 q0, q0 -; CHECK-NEXT: vcvtt.f32.f16 s17, s10 +; CHECK-NEXT: 
vrintz.f32 q2, q2 ; CHECK-NEXT: vrintx.f32 q0, q0 -; CHECK-NEXT: vcvtb.f32.f16 s16, s10 ; CHECK-NEXT: vrinta.f32 q0, q0 -; CHECK-NEXT: vrintz.f32 q3, q0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s12 -; CHECK-NEXT: vcvtt.f16.f32 s0, s13 -; CHECK-NEXT: vcvtb.f16.f32 s1, s14 -; CHECK-NEXT: vcvtt.f16.f32 s1, s15 -; CHECK-NEXT: vcvtt.f32.f16 s15, s7 -; CHECK-NEXT: vcvtb.f32.f16 s14, s7 -; CHECK-NEXT: vcvtt.f32.f16 s13, s6 -; CHECK-NEXT: vcvtb.f32.f16 s12, s6 -; CHECK-NEXT: vabs.f32 q1, q4 -; CHECK-NEXT: vminnm.f32 q1, q1, q4 -; CHECK-NEXT: vmaxnm.f32 q1, q1, q3 -; CHECK-NEXT: vfma.f32 q3, q1, q4 -; CHECK-NEXT: vrintp.f32 q1, q3 -; CHECK-NEXT: vrintm.f32 q1, q1 -; CHECK-NEXT: vrintx.f32 q1, q1 -; CHECK-NEXT: vrinta.f32 q1, q1 -; CHECK-NEXT: vrintz.f32 q1, q1 -; CHECK-NEXT: vcvtb.f16.f32 s2, s4 -; CHECK-NEXT: vcvtt.f16.f32 s2, s5 -; CHECK-NEXT: vcvtb.f16.f32 s3, s6 -; CHECK-NEXT: vcvtt.f16.f32 s3, s7 +; CHECK-NEXT: vrintz.f32 q1, q0 +; CHECK-NEXT: vcvtb.f16.f32 q0, q2 +; CHECK-NEXT: vcvtt.f16.f32 q0, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: