diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h --- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h +++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h @@ -38,7 +38,8 @@ CMulPartial, // The following 'operations' are used to represent internal states. Backends // are not expected to try and support these in any capacity. - Shuffle + Shuffle, + Symmetric }; enum class ComplexDeinterleavingRotation { diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -254,6 +254,7 @@ /// 270: r: ar + bi /// i: ai - br NodePtr identifyAdd(Instruction *Real, Instruction *Imag); + NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag); NodePtr identifyNode(Instruction *I, Instruction *J); @@ -651,6 +652,8 @@ ComplexDeinterleavingRotation Rotation = (ComplexDeinterleavingRotation)RotKey; + LLVM_DEBUG(dbgs() << " - RotKey: " << RotKey << ".\n"); + if (Rotation == llvm::ComplexDeinterleavingRotation::Rotation_0 || Rotation == llvm::ComplexDeinterleavingRotation::Rotation_180) { LLVM_DEBUG(dbgs() << " - Unsupported rotation.\n"); @@ -703,6 +706,59 @@ return match(A, Pattern) && match(B, Pattern); } +static bool isInstructionPotentiallySymmetric(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FNeg: + return true; + default: + return false; + } +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real, + Instruction *Imag) { + if(Real->getOpcode() != Imag->getOpcode()) + return nullptr; + + if (!isInstructionPotentiallySymmetric(Real) || + !isInstructionPotentiallySymmetric(Imag)) + return nullptr; + + auto *R0 = dyn_cast(Real->getOperand(0)); + auto *I0 = dyn_cast(Imag->getOperand(0)); + + if (!R0 || !I0) + return nullptr; + + NodePtr Op0 = identifyNode(R0, I0); + NodePtr Op1 = nullptr; + if (Op0 == nullptr) + return nullptr; + + if (Real->isBinaryOp()) { + auto *R1 = dyn_cast(Real->getOperand(1)); + auto *I1 = dyn_cast(Imag->getOperand(1)); + if (!R1 || !I1) + return nullptr; + + Op1 = identifyNode(R1, I1); + if (Op1 == nullptr) + return nullptr; + } + + auto Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric, + Real, Imag); + Node->addOperand(Op0); + if (Real->isBinaryOp()) + Node->addOperand(Op1); + + return submitCompositeNode(Node); +} + ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) { LLVM_DEBUG(dbgs() << "identifyNode on " << *Real << " / " << *Imag << "\n"); @@ -795,8 +851,10 @@ PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0); return submitCompositeNode(PlaceholderNode); } - if (RealShuffle || ImagShuffle) + if (RealShuffle || ImagShuffle) { + LLVM_DEBUG(dbgs() << " - There's a shuffle where there shouldn't be.\n"); return nullptr; + } auto *VTy = cast(Real->getType()); auto *NewVTy = @@ -814,7 +872,10 @@ return identifyAdd(Real, Imag); } - return nullptr; + auto Symmetric = identifySymmetricOperation(Real, Imag); + LLVM_DEBUG(if (Symmetric == nullptr) dbgs() + << " - Not recognised as a valid pattern.\n"); + return Symmetric; } bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { @@ -847,6 +908,31 @@ return RootNode != nullptr; } +static Value* replaceSymmetricNode(ComplexDeinterleavingGraph::RawNodePtr Node, Value* InputA, Value* InputB) { + Instruction* I = Node->Real; + if (I->isUnaryOp()) + assert(!InputB && + "Unary symmetric operations need one input, but two were provided."); + else if (I->isBinaryOp()) + assert(InputB && "Binary symmetric operations need two inputs, only one " + "was provided."); + + IRBuilder<> B(I); + + switch (I->getOpcode()) { + case Instruction::FNeg: + return B.CreateFNeg(InputA); + case Instruction::FAdd: + return B.CreateFAdd(InputA, InputB); + case Instruction::FSub: + return B.CreateFSub(InputA, InputB); + case Instruction::FMul: + return B.CreateFMul(InputA, InputB); + } + + return nullptr; +} + Value *ComplexDeinterleavingGraph::replaceNode( ComplexDeinterleavingGraph::RawNodePtr Node) { if (Node->ReplacementNode) @@ -862,7 +948,10 @@ assert(Input0->getType() == Input1->getType() && "Node inputs need to be of the same type"); - Node->ReplacementNode = TL->createComplexDeinterleavingIR( + if(Node->Operation == ComplexDeinterleavingOperation::Symmetric) + Node->ReplacementNode = replaceSymmetricNode(Node, Input0, Input1); + else + Node->ReplacementNode = TL->createComplexDeinterleavingIR( Node->Real, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); assert(Node->ReplacementNode && "Target failed to create Intrinsic call."); diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll @@ -366,24 +366,10 @@ define <4 x float> @mul_addequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_addequal: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v7.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: zip2 v5.2s, v1.2s, v3.2s -; CHECK-NEXT: zip1 v1.2s, v1.2s, v3.2s -; CHECK-NEXT: zip2 v6.2s, v0.2s, v4.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v4.2s -; CHECK-NEXT: zip2 v4.2s, v2.2s, v7.2s -; CHECK-NEXT: fmul v16.2s, v6.2s, v5.2s -; CHECK-NEXT: fmla v4.2s, v0.2s, v5.2s -; CHECK-NEXT: fneg v3.2s, v16.2s -; CHECK-NEXT: fmla v4.2s, v6.2s, v1.2s -; CHECK-NEXT: fmla v3.2s, v0.2s, v1.2s -; CHECK-NEXT: zip1 v0.2s, v2.2s, v7.2s -; CHECK-NEXT: fadd v0.2s, v3.2s, v0.2s -; CHECK-NEXT: zip2 v1.2s, v0.2s, v4.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v4.2s -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #0 +; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #90 +; CHECK-NEXT: fadd v0.4s, v3.4s, v2.4s ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -408,25 +394,10 @@ define <4 x float> @mul_subequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_subequal: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: zip2 v5.2s, v1.2s, v3.2s -; CHECK-NEXT: zip1 v1.2s, v1.2s, v3.2s -; CHECK-NEXT: zip2 v6.2s, v0.2s, v4.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v4.2s -; CHECK-NEXT: fmul v4.2s, v6.2s, v5.2s -; CHECK-NEXT: fmul v3.2s, v5.2s, v0.2s -; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: fneg v4.2s, v4.2s -; CHECK-NEXT: fmla v3.2s, v6.2s, v1.2s -; CHECK-NEXT: fmla v4.2s, v0.2s, v1.2s -; CHECK-NEXT: zip1 v0.2s, v2.2s, v5.2s -; CHECK-NEXT: zip2 v1.2s, v2.2s, v5.2s -; CHECK-NEXT: fsub v0.2s, v4.2s, v0.2s -; CHECK-NEXT: fsub v1.2s, v3.2s, v1.2s -; CHECK-NEXT: zip2 v2.2s, v0.2s, v1.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #0 +; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #90 +; CHECK-NEXT: fsub v0.4s, v3.4s, v2.4s ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -452,25 +423,10 @@ define <4 x float> @mul_mulequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_mulequal: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: zip2 v5.2s, v1.2s, v3.2s -; CHECK-NEXT: zip1 v1.2s, v1.2s, v3.2s -; CHECK-NEXT: zip2 v6.2s, v0.2s, v4.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v4.2s -; CHECK-NEXT: fmul v4.2s, v6.2s, v5.2s -; CHECK-NEXT: fmul v3.2s, v5.2s, v0.2s -; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: fneg v4.2s, v4.2s -; CHECK-NEXT: fmla v3.2s, v6.2s, v1.2s -; CHECK-NEXT: fmla v4.2s, v0.2s, v1.2s -; CHECK-NEXT: zip1 v0.2s, v2.2s, v5.2s -; CHECK-NEXT: zip2 v1.2s, v2.2s, v5.2s -; CHECK-NEXT: fmul v0.2s, v4.2s, v0.2s -; CHECK-NEXT: fmul v1.2s, v3.2s, v1.2s -; CHECK-NEXT: zip2 v2.2s, v0.2s, v1.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #0 +; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #90 +; CHECK-NEXT: fmul v0.4s, v3.4s, v2.4s ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -538,21 +494,10 @@ define <4 x float> @mul_negequal(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: mul_negequal: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: zip1 v4.2s, v1.2s, v2.2s -; CHECK-NEXT: zip2 v1.2s, v1.2s, v2.2s -; CHECK-NEXT: zip1 v2.2s, v0.2s, v3.2s -; CHECK-NEXT: zip2 v0.2s, v0.2s, v3.2s -; CHECK-NEXT: fmul v3.2s, v1.2s, v2.2s -; CHECK-NEXT: fmul v2.2s, v4.2s, v2.2s -; CHECK-NEXT: fmla v3.2s, v0.2s, v4.2s -; CHECK-NEXT: fneg v2.2s, v2.2s -; CHECK-NEXT: fmla v2.2s, v1.2s, v0.2s -; CHECK-NEXT: fneg v0.2s, v3.2s -; CHECK-NEXT: zip2 v1.2s, v2.2s, v0.2s -; CHECK-NEXT: zip1 v0.2s, v2.2s, v0.2s -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #0 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #90 +; CHECK-NEXT: fneg v0.4s, v2.4s ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -569,4 +514,4 @@ %7 = fneg fast <2 x float> %2 %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> ret <4 x float> %interleaved.vec -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll @@ -390,41 +390,17 @@ define <4 x float> @mul_addequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_addequal: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d12, d13} -; CHECK-NEXT: vpush {d12, d13} -; CHECK-NEXT: .vsave {d8, d9, d10} -; CHECK-NEXT: vpush {d8, d9, d10} -; CHECK-NEXT: vmov d2, r0, r1 -; CHECK-NEXT: add.w r12, sp, #40 -; CHECK-NEXT: add r0, sp, #56 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: vmov d1, r2, r3 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r12] -; CHECK-NEXT: vmov.f32 s20, s5 -; CHECK-NEXT: vmov.f32 s8, s12 -; CHECK-NEXT: vmov.f32 s12, s16 -; CHECK-NEXT: vmov.f32 s16, s17 -; CHECK-NEXT: vmov.f32 s17, s19 -; CHECK-NEXT: vmov.f32 s21, s3 -; CHECK-NEXT: vmul.f32 q6, q5, q4 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vmov.f32 s1, s15 -; CHECK-NEXT: vneg.f32 q6, q6 -; CHECK-NEXT: vmov.f32 s5, s2 -; CHECK-NEXT: vmov.f32 s13, s18 -; CHECK-NEXT: vfma.f32 q0, q4, q1 -; CHECK-NEXT: vmov.f32 s9, s14 -; CHECK-NEXT: vfma.f32 q6, q3, q1 -; CHECK-NEXT: vfma.f32 q0, q3, q5 -; CHECK-NEXT: vadd.f32 q1, q6, q2 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vmov.f32 s3, s1 -; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90 +; CHECK-NEXT: vadd.f32 q0, q3, q1 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vpop {d8, d9, d10} -; CHECK-NEXT: vpop {d12, d13} ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -449,43 +425,17 @@ define <4 x float> @mul_subequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_subequal: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d12, d13} -; CHECK-NEXT: vpush {d12, d13} -; CHECK-NEXT: .vsave {d8, d9, d10} -; CHECK-NEXT: vpush {d8, d9, d10} -; CHECK-NEXT: vmov d2, r0, r1 -; CHECK-NEXT: add r1, sp, #40 -; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: vmov d1, r2, r3 -; CHECK-NEXT: add r0, sp, #56 -; CHECK-NEXT: vmov.f32 s20, s4 -; CHECK-NEXT: vmov.f32 s12, s16 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s16, s17 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vmov.f32 s17, s19 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmul.f32 q6, q1, q4 -; CHECK-NEXT: vmov.f32 s21, s2 -; CHECK-NEXT: vmov.f32 s13, s18 -; CHECK-NEXT: vneg.f32 q6, q6 -; CHECK-NEXT: vmul.f32 q4, q4, q5 -; CHECK-NEXT: vmov.f32 s0, s8 -; CHECK-NEXT: vmov.f32 s8, s9 -; CHECK-NEXT: vfma.f32 q6, q3, q5 -; CHECK-NEXT: vmov.f32 s1, s10 -; CHECK-NEXT: vfma.f32 q4, q3, q1 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vsub.f32 q0, q6, q0 -; CHECK-NEXT: vsub.f32 q1, q4, q2 -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90 +; CHECK-NEXT: vsub.f32 q0, q3, q1 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vpop {d8, d9, d10} -; CHECK-NEXT: vpop {d12, d13} ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -511,43 +461,17 @@ define <4 x float> @mul_mulequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_mulequal: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d12, d13} -; CHECK-NEXT: vpush {d12, d13} -; CHECK-NEXT: .vsave {d8, d9, d10} -; CHECK-NEXT: vpush {d8, d9, d10} -; CHECK-NEXT: vmov d2, r0, r1 -; CHECK-NEXT: add r1, sp, #40 -; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: vmov d1, r2, r3 -; CHECK-NEXT: add r0, sp, #56 -; CHECK-NEXT: vmov.f32 s20, s4 -; CHECK-NEXT: vmov.f32 s12, s16 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s16, s17 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vmov.f32 s17, s19 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmul.f32 q6, q1, q4 -; CHECK-NEXT: vmov.f32 s21, s2 -; CHECK-NEXT: vmov.f32 s13, s18 -; CHECK-NEXT: vneg.f32 q6, q6 -; CHECK-NEXT: vmul.f32 q4, q4, q5 -; CHECK-NEXT: vmov.f32 s0, s8 -; CHECK-NEXT: vmov.f32 s8, s9 -; CHECK-NEXT: vfma.f32 q6, q3, q5 -; CHECK-NEXT: vmov.f32 s1, s10 -; CHECK-NEXT: vfma.f32 q4, q3, q1 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmul.f32 q0, q6, q0 -; CHECK-NEXT: vmul.f32 q1, q4, q2 -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90 +; CHECK-NEXT: vmul.f32 q0, q3, q1 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vpop {d8, d9, d10} -; CHECK-NEXT: vpop {d12, d13} ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -626,32 +550,15 @@ define <4 x float> @mul_negequal(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: mul_negequal: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov d0, r0, r1 -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov d1, r2, r3 -; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s4, s8 -; CHECK-NEXT: vmov.f32 s8, s9 -; CHECK-NEXT: vmov.f32 s5, s10 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmul.f32 q4, q2, q0 -; CHECK-NEXT: vmul.f32 q0, q1, q0 -; CHECK-NEXT: vfma.f32 q4, q1, q3 -; CHECK-NEXT: vneg.f32 q0, q0 -; CHECK-NEXT: vneg.f32 q1, q4 -; CHECK-NEXT: vfma.f32 q0, q3, q2 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90 +; CHECK-NEXT: vneg.f32 q0, q2 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32>