diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -143,6 +143,11 @@ Instruction *Real; Instruction *Imag; + // These two members are required exclusively for generating + // ComplexDeinterleavingOperation::Symmetric operations. + unsigned Opcode; + FastMathFlags Flags; + ComplexDeinterleavingRotation Rotation = ComplexDeinterleavingRotation::Rotation_0; SmallVector Operands; @@ -186,8 +191,26 @@ class ComplexDeinterleavingGraph { public: + struct Product { + Instruction *Multiplier; + Instruction *Multiplicand; + bool IsPositive; + }; + + using Addend = std::pair; using NodePtr = ComplexDeinterleavingCompositeNode::NodePtr; using RawNodePtr = ComplexDeinterleavingCompositeNode::RawNodePtr; + + // Helper struct for holding info about potential partial multiplication + // candidates + struct PartialMulCandidate { + Instruction *Common; + NodePtr Node; + unsigned RealIdx; + unsigned ImagIdx; + bool IsNodeInverted; + }; + explicit ComplexDeinterleavingGraph(const TargetLowering *TL, const TargetLibraryInfo *TLI) : TL(TL), TLI(TLI) {} @@ -256,6 +279,40 @@ NodePtr identifyNode(Instruction *I, Instruction *J); + /// Determine if a sum of complex numbers can be formed from \p RealAddends + /// and \p ImagAddends. If \p Accumulator is not null, add the result to it. + /// Return nullptr if it is not possible to construct a complex number. + /// \p Flags are needed to generate symmetric Add and Sub operations. + NodePtr identifyAdditions(std::list &RealAddends, + std::list &ImagAddends, FastMathFlags Flags, + NodePtr Accumulator); + + /// Extract one addend that has both real and imaginary parts positive. + NodePtr extractPositiveAddend(std::list &RealAddends, + std::list &ImagAddends); + + /// Determine if sum of multiplications of complex numbers can be formed from + /// \p RealMuls and \p ImagMuls. 
If \p Accumulator is not null, add the result + /// to it. Return nullptr if it is not possible to construct a complex number. + NodePtr identifyMultiplications(std::vector &RealMuls, + std::vector &ImagMuls, + NodePtr Accumulator); + + /// Go through pairs of multiplication (one Real and one Imag) and find all + /// possible candidates for partial multiplication and put them into \p + /// Candidates. Returns true if all Product has pair with common operand + bool collectPartialMuls(const std::vector &RealMuls, + const std::vector &ImagMuls, + std::vector &Candidates); + + /// If the code is compiled with -Ofast or expressions have `reassoc` flag, + /// the order of complex computation operations may be significantly altered, + /// and the real and imaginary parts may not be executed in parallel. This + /// function takes this into consideration and employs a more general approach + /// to identify complex computations. Initially, it gathers all the addends + /// and multiplicands and then constructs a complex expression from them. 
+ NodePtr identifyReassocNodes(Instruction *I, Instruction *J); + NodePtr identifyRoot(Instruction *I); /// Identifies the Deinterleave operation applied to a vector containing @@ -737,8 +794,16 @@ return nullptr; } + if (isa(Real) && + Real->getFastMathFlags() != Imag->getFastMathFlags()) + return nullptr; + auto Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric, Real, Imag); + Node->Opcode = Real->getOpcode(); + if (isa(Real)) + Node->Flags = Real->getFastMathFlags(); + Node->addOperand(Op0); if (Real->isBinaryOp()) Node->addOperand(Op1); @@ -754,29 +819,477 @@ return CN; } - NodePtr Node = identifyDeinterleave(Real, Imag); - if (Node) - return Node; + if (NodePtr CN = identifyDeinterleave(Real, Imag)) + return CN; auto *VTy = cast(Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); - if (TL->isComplexDeinterleavingOperationSupported( - ComplexDeinterleavingOperation::CMulPartial, NewVTy) && - isInstructionPairMul(Real, Imag)) { - return identifyPartialMul(Real, Imag); + bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CMulPartial, NewVTy); + bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CAdd, NewVTy); + + if (HasCMulSupport && isInstructionPairMul(Real, Imag)) { + if (NodePtr CN = identifyPartialMul(Real, Imag)) + return CN; + } + + if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) { + if (NodePtr CN = identifyAdd(Real, Imag)) + return CN; + } + + if (HasCMulSupport && HasCAddSupport) { + if (NodePtr CN = identifyReassocNodes(Real, Imag)) + return CN; + } + + if (NodePtr CN = identifySymmetricOperation(Real, Imag)) + return CN; + + LLVM_DEBUG(dbgs() << " - Not recognised as a valid pattern.\n"); + return nullptr; +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, + Instruction *Imag) { + if ((Real->getOpcode() != Instruction::FAdd && + 
Real->getOpcode() != Instruction::FSub && + Real->getOpcode() != Instruction::FNeg) || + (Imag->getOpcode() != Instruction::FAdd && + Imag->getOpcode() != Instruction::FSub && + Imag->getOpcode() != Instruction::FNeg)) + return nullptr; + + if (Real->getFastMathFlags() != Imag->getFastMathFlags()) { + LLVM_DEBUG( + dbgs() + << "The flags in Real and Imaginary instructions are not identical\n"); + return nullptr; + } + + FastMathFlags Flags = Real->getFastMathFlags(); + if (!Flags.allowReassoc()) { + LLVM_DEBUG( + dbgs() << "the 'Reassoc' attribute is missing in the FastMath flags\n"); + return nullptr; + } + + // Collect multiplications and addend instructions from the given instruction + // while traversing its operands. Additionally, verify that all instructions + // have the same fast math flags. + auto Collect = [&Flags](Instruction *Insn, std::vector &Muls, + std::list &Addends) -> bool { + SmallVector> Worklist = {{Insn, true}}; + SmallPtrSet Visited; + while (!Worklist.empty()) { + auto [V, IsPositive] = Worklist.back(); + Worklist.pop_back(); + if (!Visited.insert(V).second) + continue; + + Instruction *I = dyn_cast(V); + if (!I) + return false; + + // If an instruction has more than one user, it indicates that it either + // has an external user, which will be later checked by the checkNodes + // function, or it is a subexpression utilized by multiple expressions. In + // the latter case, we will attempt to separately identify the complex + // operation from here in order to create a shared + // ComplexDeinterleavingCompositeNode. 
+ if (I != Insn && I->getNumUses() > 1) { + LLVM_DEBUG(dbgs() << "Found potential sub-expression: " << *I << "\n"); + Addends.emplace_back(I, IsPositive); + continue; + } + + if (I->getOpcode() == Instruction::FAdd) { + Worklist.emplace_back(I->getOperand(1), IsPositive); + Worklist.emplace_back(I->getOperand(0), IsPositive); + } else if (I->getOpcode() == Instruction::FSub) { + Worklist.emplace_back(I->getOperand(1), !IsPositive); + Worklist.emplace_back(I->getOperand(0), IsPositive); + } else if (I->getOpcode() == Instruction::FMul) { + auto *A = dyn_cast(I->getOperand(0)); + if (A && A->getOpcode() == Instruction::FNeg) { + A = dyn_cast(A->getOperand(0)); + IsPositive = !IsPositive; + } + if (!A) + return false; + auto *B = dyn_cast(I->getOperand(1)); + if (B && B->getOpcode() == Instruction::FNeg) { + B = dyn_cast(B->getOperand(0)); + IsPositive = !IsPositive; + } + if (!B) + return false; + Muls.push_back(Product{A, B, IsPositive}); + } else if (I->getOpcode() == Instruction::FNeg) { + Worklist.emplace_back(I->getOperand(0), !IsPositive); + } else { + Addends.emplace_back(I, IsPositive); + continue; + } + + if (I->getFastMathFlags() != Flags) { + LLVM_DEBUG(dbgs() << "The instruction's fast math flags are " + "inconsistent with the root instructions' flags: " + << *I << "\n"); + return false; + } + } + return true; + }; + + std::vector RealMuls, ImagMuls; + std::list RealAddends, ImagAddends; + if (!Collect(Real, RealMuls, RealAddends) || + !Collect(Imag, ImagMuls, ImagAddends)) + return nullptr; + + if (RealAddends.size() != ImagAddends.size()) + return nullptr; + + NodePtr FinalNode; + if (!RealMuls.empty() || !ImagMuls.empty()) { + // If there are multiplicands, extract positive addend and use it as an + // accumulator + FinalNode = extractPositiveAddend(RealAddends, ImagAddends); + FinalNode = identifyMultiplications(RealMuls, ImagMuls, FinalNode); + if (!FinalNode) + return nullptr; } - if (TL->isComplexDeinterleavingOperationSupported( - 
ComplexDeinterleavingOperation::CAdd, NewVTy) && - isInstructionPairAdd(Real, Imag)) { - return identifyAdd(Real, Imag); + // Identify and process remaining additions + if (!RealAddends.empty() || !ImagAddends.empty()) { + FinalNode = identifyAdditions(RealAddends, ImagAddends, Flags, FinalNode); + if (!FinalNode) + return nullptr; } - auto Symmetric = identifySymmetricOperation(Real, Imag); - LLVM_DEBUG(if (Symmetric == nullptr) dbgs() - << " - Not recognised as a valid pattern.\n"); - return Symmetric; + // Set the Real and Imag fields of the final node and submit it + FinalNode->Real = Real; + FinalNode->Imag = Imag; + submitCompositeNode(FinalNode); + return FinalNode; +} + +bool ComplexDeinterleavingGraph::collectPartialMuls( + const std::vector &RealMuls, const std::vector &ImagMuls, + std::vector &PartialMulCandidates) { + // Helper function to extract a common operand from two products + auto FindCommonInstruction = [](const Product &Real, + const Product &Imag) -> Instruction * { + if (Real.Multiplicand == Imag.Multiplicand || + Real.Multiplicand == Imag.Multiplier) + return Real.Multiplicand; + + if (Real.Multiplier == Imag.Multiplicand || + Real.Multiplier == Imag.Multiplier) + return Real.Multiplier; + + return nullptr; + }; + + // Iterating over real and imaginary multiplications to find common operands + // If a common operand is found, a partial multiplication candidate is created + // and added to the candidates vector The function returns false if no common + // operands are found for any product + for (unsigned i = 0; i < RealMuls.size(); ++i) { + bool FoundCommon = false; + for (unsigned j = 0; j < ImagMuls.size(); ++j) { + auto *Common = FindCommonInstruction(RealMuls[i], ImagMuls[j]); + if (!Common) + continue; + + auto *A = RealMuls[i].Multiplicand == Common ? RealMuls[i].Multiplier + : RealMuls[i].Multiplicand; + auto *B = ImagMuls[j].Multiplicand == Common ? 
ImagMuls[j].Multiplier + : ImagMuls[j].Multiplicand; + + bool Inverted = false; + auto Node = identifyNode(A, B); + if (!Node) { + std::swap(A, B); + Inverted = true; + Node = identifyNode(A, B); + } + if (!Node) + continue; + + FoundCommon = true; + PartialMulCandidates.push_back({Common, Node, i, j, Inverted}); + } + if (!FoundCommon) + return false; + } + return true; +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyMultiplications( + std::vector &RealMuls, std::vector &ImagMuls, + NodePtr Accumulator = nullptr) { + if (RealMuls.size() != ImagMuls.size()) + return nullptr; + + std::vector Info; + if (!collectPartialMuls(RealMuls, ImagMuls, Info)) + return nullptr; + + // Map to store common instruction to node pointers + std::map CommonToNode; + std::vector Processed(Info.size(), false); + for (unsigned I = 0; I < Info.size(); ++I) { + if (Processed[I]) + continue; + + PartialMulCandidate &InfoA = Info[I]; + for (unsigned J = I + 1; J < Info.size(); ++J) { + if (Processed[J]) + continue; + + PartialMulCandidate &InfoB = Info[J]; + auto *InfoReal = &InfoA; + auto *InfoImag = &InfoB; + + auto NodeFromCommon = identifyNode(InfoReal->Common, InfoImag->Common); + if (!NodeFromCommon) { + std::swap(InfoReal, InfoImag); + NodeFromCommon = identifyNode(InfoReal->Common, InfoImag->Common); + } + if (!NodeFromCommon) + continue; + + CommonToNode[InfoReal->Common] = NodeFromCommon; + CommonToNode[InfoImag->Common] = NodeFromCommon; + Processed[I] = true; + Processed[J] = true; + } + } + + std::vector ProcessedReal(RealMuls.size(), false); + std::vector ProcessedImag(ImagMuls.size(), false); + NodePtr Result = Accumulator; + for (auto &PMI : Info) { + if (ProcessedReal[PMI.RealIdx] || ProcessedImag[PMI.ImagIdx]) + continue; + + auto It = CommonToNode.find(PMI.Common); + // TODO: Process independent complex multiplications. Cases like this: + // A.real() * B where both A and B are complex numbers. 
+ if (It == CommonToNode.end()) { + LLVM_DEBUG({ + dbgs() << "Unprocessed independent partial multiplication:\n"; + for (auto *Mul : {&RealMuls[PMI.RealIdx], &ImagMuls[PMI.ImagIdx]}) + dbgs().indent(4) << (Mul->IsPositive ? "+" : "-") << *Mul->Multiplier + << " multiplied by " << *Mul->Multiplicand << "\n"; + }); + return nullptr; + } + + auto &RealMul = RealMuls[PMI.RealIdx]; + auto &ImagMul = ImagMuls[PMI.ImagIdx]; + + auto NodeA = It->second; + auto NodeB = PMI.Node; + auto IsMultiplicandReal = PMI.Common == NodeA->Real; + // The following table illustrates the relationship between multiplications + // and rotations. If we consider the multiplication (X + iY) * (U + iV), we + // can see: + // + // Rotation | Real | Imag | + // ---------+--------+--------+ + // 0 | x * u | x * v | + // 90 | -y * v | y * u | + // 180 | -x * u | -x * v | + // 270 | y * v | -y * u | + // + // Check if the candidate can indeed be represented by partial + // multiplication + // TODO: Add support for multiplication by complex one + if ((IsMultiplicandReal && PMI.IsNodeInverted) || + (!IsMultiplicandReal && !PMI.IsNodeInverted)) + continue; + + // Determine the rotation based on the multiplications + ComplexDeinterleavingRotation Rotation; + if (IsMultiplicandReal) { + // Detect 0 and 180 degrees rotation + if (RealMul.IsPositive && ImagMul.IsPositive) + Rotation = llvm::ComplexDeinterleavingRotation::Rotation_0; + else if (!RealMul.IsPositive && !ImagMul.IsPositive) + Rotation = llvm::ComplexDeinterleavingRotation::Rotation_180; + else + continue; + + } else { + // Detect 90 and 270 degrees rotation + if (!RealMul.IsPositive && ImagMul.IsPositive) + Rotation = llvm::ComplexDeinterleavingRotation::Rotation_90; + else if (RealMul.IsPositive && !ImagMul.IsPositive) + Rotation = llvm::ComplexDeinterleavingRotation::Rotation_270; + else + continue; + } + + LLVM_DEBUG({ + dbgs() << "Identified partial multiplication (X, Y) * (U, V):\n"; + dbgs().indent(4) << "X: " << *NodeA->Real << "\n"; + 
dbgs().indent(4) << "Y: " << *NodeA->Imag << "\n"; + dbgs().indent(4) << "U: " << *NodeB->Real << "\n"; + dbgs().indent(4) << "V: " << *NodeB->Imag << "\n"; + dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n"; + }); + + NodePtr NodeMul = prepareCompositeNode( + ComplexDeinterleavingOperation::CMulPartial, nullptr, nullptr); + NodeMul->Rotation = Rotation; + NodeMul->addOperand(NodeA); + NodeMul->addOperand(NodeB); + if (Result) + NodeMul->addOperand(Result); + submitCompositeNode(NodeMul); + Result = NodeMul; + ProcessedReal[PMI.RealIdx] = true; + ProcessedImag[PMI.ImagIdx] = true; + } + + // Ensure all products have been processed, if not return nullptr. + if (!all_of(ProcessedReal, [](bool V) { return V; }) || + !all_of(ProcessedImag, [](bool V) { return V; })) { + + // Dump debug information about which partial multiplications are not + // processed. + LLVM_DEBUG({ + dbgs() << "Unprocessed products (Real):\n"; + for (size_t i = 0; i < ProcessedReal.size(); ++i) { + if (!ProcessedReal[i]) + dbgs().indent(4) << (RealMuls[i].IsPositive ? "+" : "-") + << *RealMuls[i].Multiplier << " multiplied by " + << *RealMuls[i].Multiplicand << "\n"; + } + dbgs() << "Unprocessed products (Imag):\n"; + for (size_t i = 0; i < ProcessedImag.size(); ++i) { + if (!ProcessedImag[i]) + dbgs().indent(4) << (ImagMuls[i].IsPositive ? "+" : "-") + << *ImagMuls[i].Multiplier << " multiplied by " + << *ImagMuls[i].Multiplicand << "\n"; + } + }); + return nullptr; + } + + return Result; +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyAdditions(std::list &RealAddends, + std::list &ImagAddends, + FastMathFlags Flags, + NodePtr Accumulator = nullptr) { + if (RealAddends.size() != ImagAddends.size()) + return nullptr; + + NodePtr Result; + // If we have accumulator use it as first addend + if (Accumulator) + Result = Accumulator; + // Otherwise find an element with both positive real and imaginary parts. 
+ else + Result = extractPositiveAddend(RealAddends, ImagAddends); + + if (!Result) + return nullptr; + + while (!RealAddends.empty()) { + auto ItR = RealAddends.begin(); + auto [R, IsPositiveR] = *ItR; + + bool FoundImag = false; + for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) { + auto [I, IsPositiveI] = *ItI; + ComplexDeinterleavingRotation Rotation; + if (IsPositiveR && IsPositiveI) + Rotation = ComplexDeinterleavingRotation::Rotation_0; + else if (!IsPositiveR && IsPositiveI) + Rotation = ComplexDeinterleavingRotation::Rotation_90; + else if (!IsPositiveR && !IsPositiveI) + Rotation = ComplexDeinterleavingRotation::Rotation_180; + else + Rotation = ComplexDeinterleavingRotation::Rotation_270; + + NodePtr AddNode; + if (Rotation == ComplexDeinterleavingRotation::Rotation_0 || + Rotation == ComplexDeinterleavingRotation::Rotation_180) { + AddNode = identifyNode(R, I); + } else { + AddNode = identifyNode(I, R); + } + if (AddNode) { + LLVM_DEBUG({ + dbgs() << "Identified addition:\n"; + dbgs().indent(4) << "X: " << *R << "\n"; + dbgs().indent(4) << "Y: " << *I << "\n"; + dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n"; + }); + + NodePtr TmpNode; + if (Rotation == llvm::ComplexDeinterleavingRotation::Rotation_0) { + TmpNode = prepareCompositeNode( + ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr); + TmpNode->Opcode = Instruction::FAdd; + TmpNode->Flags = Flags; + } else if (Rotation == + llvm::ComplexDeinterleavingRotation::Rotation_180) { + TmpNode = prepareCompositeNode( + ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr); + TmpNode->Opcode = Instruction::FSub; + TmpNode->Flags = Flags; + } else { + TmpNode = prepareCompositeNode(ComplexDeinterleavingOperation::CAdd, + nullptr, nullptr); + TmpNode->Rotation = Rotation; + } + + TmpNode->addOperand(Result); + TmpNode->addOperand(AddNode); + submitCompositeNode(TmpNode); + Result = TmpNode; + RealAddends.erase(ItR); + ImagAddends.erase(ItI); + FoundImag 
= true; + break; + } + } + if (!FoundImag) + return nullptr; + } + return Result; +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::extractPositiveAddend( + std::list &RealAddends, std::list &ImagAddends) { + for (auto ItR = RealAddends.begin(); ItR != RealAddends.end(); ++ItR) { + for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) { + auto [R, IsPositiveR] = *ItR; + auto [I, IsPositiveI] = *ItI; + if (IsPositiveR && IsPositiveI) { + auto Result = identifyNode(R, I); + if (Result) { + RealAddends.erase(ItR); + ImagAddends.erase(ItI); + return Result; + } + } + } + } + return nullptr; } bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { @@ -1011,29 +1524,28 @@ return submitCompositeNode(PlaceholderNode); } -static Value *replaceSymmetricNode(IRBuilderBase &B, - ComplexDeinterleavingGraph::RawNodePtr Node, - Value *InputA, Value *InputB) { - Instruction *I = Node->Real; - if (I->isUnaryOp()) - assert(!InputB && - "Unary symmetric operations need one input, but two were provided."); - else if (I->isBinaryOp()) - assert(InputB && "Binary symmetric operations need two inputs, only one " - "was provided."); - - switch (I->getOpcode()) { +static Value *replaceSymmetricNode(IRBuilderBase &B, unsigned Opcode, + FastMathFlags Flags, Value *InputA, + Value *InputB) { + Value *I; + switch (Opcode) { case Instruction::FNeg: - return B.CreateFNegFMF(InputA, I); + I = B.CreateFNeg(InputA); + break; case Instruction::FAdd: - return B.CreateFAddFMF(InputA, InputB, I); + I = B.CreateFAdd(InputA, InputB); + break; case Instruction::FSub: - return B.CreateFSubFMF(InputA, InputB, I); + I = B.CreateFSub(InputA, InputB); + break; case Instruction::FMul: - return B.CreateFMulFMF(InputA, InputB, I); + I = B.CreateFMul(InputA, InputB); + break; + default: + llvm_unreachable("Incorrect symmetric opcode"); } - - return nullptr; + cast(I)->setFastMathFlags(Flags); + return I; } Value 
*ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, @@ -1048,13 +1560,13 @@ Value *Accumulator = Node->Operands.size() > 2 ? replaceNode(Builder, Node->Operands[2]) : nullptr; - if (Input1) assert(Input0->getType() == Input1->getType() && "Node inputs need to be of the same type"); if (Node->Operation == ComplexDeinterleavingOperation::Symmetric) - Node->ReplacementNode = replaceSymmetricNode(Builder, Node, Input0, Input1); + Node->ReplacementNode = replaceSymmetricNode(Builder, Node->Opcode, + Node->Flags, Input0, Input1); else Node->ReplacementNode = TL->createComplexDeinterleavingIR( Builder, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll @@ -7,18 +7,12 @@ define <4 x double> @mull_add(<4 x double> %a, <4 x double> %b, <4 x double> %c) { ; CHECK-LABEL: mull_add: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip2 v6.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v7.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip1 v1.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v4.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v2.2d, v2.2d, v3.2d -; CHECK-NEXT: fmla v6.2d, v0.2d, v4.2d -; CHECK-NEXT: fmla v1.2d, v7.2d, v4.2d -; CHECK-NEXT: fmla v6.2d, v7.2d, v2.2d -; CHECK-NEXT: fmls v1.2d, v0.2d, v2.2d -; CHECK-NEXT: zip1 v0.2d, v1.2d, v6.2d -; CHECK-NEXT: zip2 v1.2d, v1.2d, v6.2d +; CHECK-NEXT: fcmla v4.2d, v2.2d, v0.2d, #0 +; CHECK-NEXT: fcmla v5.2d, v3.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v4.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcmla v5.2d, v3.2d, v1.2d, #90 +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> @@ -43,25 +37,18 
@@ define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip1 v16.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v17.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v2.2d, v4.2d, v5.2d -; CHECK-NEXT: zip2 v3.2d, v4.2d, v5.2d -; CHECK-NEXT: fmul v4.2d, v16.2d, v0.2d -; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d -; CHECK-NEXT: zip2 v6.2d, v6.2d, v7.2d -; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d -; CHECK-NEXT: fmul v7.2d, v16.2d, v17.2d -; CHECK-NEXT: fmla v4.2d, v17.2d, v1.2d -; CHECK-NEXT: fmla v0.2d, v3.2d, v6.2d -; CHECK-NEXT: fmla v7.2d, v2.2d, v5.2d -; CHECK-NEXT: fmla v4.2d, v3.2d, v5.2d -; CHECK-NEXT: fsub v1.2d, v7.2d, v0.2d -; CHECK-NEXT: fmla v4.2d, v2.2d, v6.2d -; CHECK-NEXT: zip1 v0.2d, v1.2d, v4.2d -; CHECK-NEXT: zip2 v1.2d, v1.2d, v4.2d +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #90 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> @@ -94,26 +81,18 @@ define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip1 v17.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v18.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v2.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v3.2d, v6.2d, v7.2d -; 
CHECK-NEXT: zip1 v16.2d, v4.2d, v5.2d -; CHECK-NEXT: fmul v4.2d, v17.2d, v0.2d -; CHECK-NEXT: fmul v5.2d, v17.2d, v18.2d -; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d -; CHECK-NEXT: zip2 v6.2d, v6.2d, v7.2d -; CHECK-NEXT: fmul v7.2d, v3.2d, v2.2d -; CHECK-NEXT: fmla v4.2d, v18.2d, v1.2d -; CHECK-NEXT: fmla v0.2d, v16.2d, v3.2d -; CHECK-NEXT: fmla v5.2d, v2.2d, v6.2d -; CHECK-NEXT: fmla v7.2d, v16.2d, v6.2d -; CHECK-NEXT: fsub v1.2d, v5.2d, v0.2d -; CHECK-NEXT: fsub v2.2d, v4.2d, v7.2d -; CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d -; CHECK-NEXT: zip2 v1.2d, v1.2d, v2.2d +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #270 +; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #270 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #180 +; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #180 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> @@ -146,25 +125,18 @@ define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip2 v16.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v17.2d, v0.2d, v1.2d -; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: fmul v3.2d, v16.2d, v17.2d -; CHECK-NEXT: fmul v1.2d, v2.2d, v17.2d -; CHECK-NEXT: zip1 v17.2d, v4.2d, v5.2d -; CHECK-NEXT: zip2 v4.2d, v4.2d, v5.2d -; CHECK-NEXT: fneg v3.2d, v3.2d -; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d -; CHECK-NEXT: fmla v1.2d, v0.2d, v16.2d -; CHECK-NEXT: fmla v3.2d, v0.2d, v2.2d -; CHECK-NEXT: zip2 v0.2d, v6.2d, v7.2d -; CHECK-NEXT: fmls v1.2d, v4.2d, v5.2d -; CHECK-NEXT: fmla v3.2d, v17.2d, v5.2d -; 
CHECK-NEXT: fmla v1.2d, v17.2d, v0.2d -; CHECK-NEXT: fmla v3.2d, v4.2d, v0.2d -; CHECK-NEXT: zip1 v0.2d, v3.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v3.2d, v1.2d +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #90 +; CHECK-NEXT: fcmla v16.2d, v6.2d, v4.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v7.2d, v5.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v6.2d, v4.2d, #270 +; CHECK-NEXT: fcmla v17.2d, v7.2d, v5.2d, #270 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll @@ -7,21 +7,13 @@ define @mull_add( %a, %b, %c) { ; CHECK-LABEL: mull_add: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 z6.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z7.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z1.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmla z1.d, p0/m, z4.d, z7.d -; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d -; CHECK-NEXT: movprfx z5, z6 -; CHECK-NEXT: fmla z5.d, p0/m, z4.d, z0.d -; CHECK-NEXT: movprfx z3, z5 -; CHECK-NEXT: fmla z3.d, p0/m, z2.d, z7.d -; CHECK-NEXT: fmls z1.d, p0/m, z2.d, z0.d -; CHECK-NEXT: zip1 z0.d, z1.d, z3.d -; CHECK-NEXT: zip2 z1.d, z1.d, z3.d +; CHECK-NEXT: fcmla z4.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #0 +; CHECK-NEXT: fcmla z4.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #90 +; CHECK-NEXT: mov z0.d, 
z4.d +; CHECK-NEXT: mov z1.d, z5.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -49,26 +41,19 @@ define @mul_add_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d -; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d -; CHECK-NEXT: fmul z2.d, z1.d, z0.d +; CHECK-NEXT: mov z24.d, #0 // =0x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d -; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z26.d, z6.d, z7.d -; CHECK-NEXT: fmul z1.d, z1.d, z25.d -; CHECK-NEXT: fmul z0.d, z24.d, z0.d -; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d -; CHECK-NEXT: uzp2 z5.d, z6.d, z7.d -; CHECK-NEXT: fmla z1.d, p0/m, z26.d, z4.d -; CHECK-NEXT: fmla z2.d, p0/m, z26.d, z3.d -; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z3.d -; CHECK-NEXT: fmla z2.d, p0/m, z5.d, z4.d -; CHECK-NEXT: fsub z1.d, z1.d, z0.d -; CHECK-NEXT: zip1 z0.d, z1.d, z2.d -; CHECK-NEXT: zip2 z1.d, z1.d, z2.d +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #90 +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -105,27 +90,19 @@ define @mul_sub_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d +; CHECK-NEXT: mov z24.d, #0 // =0x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d -; 
CHECK-NEXT: fmul z2.d, z1.d, z0.d -; CHECK-NEXT: fmul z1.d, z1.d, z25.d -; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z5.d, z6.d, z7.d -; CHECK-NEXT: uzp2 z6.d, z6.d, z7.d -; CHECK-NEXT: fmul z0.d, z24.d, z0.d -; CHECK-NEXT: fmla z1.d, p0/m, z6.d, z3.d -; CHECK-NEXT: fmul z3.d, z5.d, z3.d -; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d -; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z4.d -; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d -; CHECK-NEXT: fsub z1.d, z1.d, z0.d -; CHECK-NEXT: fsub z2.d, z2.d, z3.d -; CHECK-NEXT: zip1 z0.d, z1.d, z2.d -; CHECK-NEXT: zip2 z1.d, z1.d, z2.d +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #270 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #270 +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #180 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #180 +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -162,26 +139,19 @@ define @mul_conj_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d -; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d -; CHECK-NEXT: fmul z2.d, z1.d, z0.d +; CHECK-NEXT: mov z24.d, #0 // =0x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmul z0.d, z24.d, z0.d -; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d -; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z5.d, z6.d, z7.d -; CHECK-NEXT: fnmls z0.d, p0/m, z1.d, z25.d -; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: fmls z1.d, p0/m, z5.d, z3.d -; CHECK-NEXT: uzp2 z2.d, z6.d, z7.d -; CHECK-NEXT: 
fmla z1.d, p0/m, z2.d, z4.d -; CHECK-NEXT: fmad z3.d, p0/m, z2.d, z0.d -; CHECK-NEXT: zip1 z0.d, z3.d, z1.d -; CHECK-NEXT: zip2 z1.d, z3.d, z1.d +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #270 +; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll @@ -484,9 +484,9 @@ ; CHECK-LABEL: mul_negequal: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #0 -; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #90 -; CHECK-NEXT: fneg v0.4s, v2.4s +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #180 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #270 +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll @@ -299,50 +299,34 @@ define void @mul_add_common_mul_add_mul(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e, <4 x double> %f, <4 x double> %g, <4 x double> %h, ptr %p1, ptr %p2) { ; 
CHECK-LABEL: mul_add_common_mul_add_mul: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q17, q16, [sp, #96] -; CHECK-NEXT: zip2 v20.2d, v4.2d, v5.2d -; CHECK-NEXT: zip2 v21.2d, v6.2d, v7.2d -; CHECK-NEXT: zip1 v4.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d -; CHECK-NEXT: ldp q19, q18, [sp, #64] -; CHECK-NEXT: zip2 v23.2d, v17.2d, v16.2d -; CHECK-NEXT: fmul v6.2d, v21.2d, v20.2d -; CHECK-NEXT: zip1 v16.2d, v17.2d, v16.2d -; CHECK-NEXT: zip2 v22.2d, v19.2d, v18.2d -; CHECK-NEXT: zip1 v18.2d, v19.2d, v18.2d -; CHECK-NEXT: fneg v6.2d, v6.2d -; CHECK-NEXT: fmul v20.2d, v5.2d, v20.2d -; CHECK-NEXT: fmul v7.2d, v22.2d, v23.2d -; CHECK-NEXT: fmla v6.2d, v4.2d, v5.2d -; CHECK-NEXT: zip2 v5.2d, v2.2d, v3.2d -; CHECK-NEXT: fneg v7.2d, v7.2d -; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d -; CHECK-NEXT: fmla v7.2d, v18.2d, v16.2d -; CHECK-NEXT: fadd v19.2d, v7.2d, v6.2d -; CHECK-NEXT: fmla v20.2d, v4.2d, v21.2d -; CHECK-NEXT: zip2 v4.2d, v0.2d, v1.2d -; CHECK-NEXT: ldp q7, q6, [sp] -; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: fmla v20.2d, v18.2d, v23.2d -; CHECK-NEXT: fmul v1.2d, v2.2d, v4.2d -; CHECK-NEXT: fmla v20.2d, v22.2d, v16.2d -; CHECK-NEXT: mov v3.16b, v19.16b -; CHECK-NEXT: fmla v1.2d, v0.2d, v5.2d -; CHECK-NEXT: fmla v3.2d, v4.2d, v5.2d -; CHECK-NEXT: ldp q16, q4, [sp, #32] -; CHECK-NEXT: fneg v17.2d, v3.2d -; CHECK-NEXT: zip1 v3.2d, v7.2d, v6.2d -; CHECK-NEXT: zip2 v6.2d, v7.2d, v6.2d -; CHECK-NEXT: zip1 v5.2d, v16.2d, v4.2d -; CHECK-NEXT: fmla v17.2d, v0.2d, v2.2d -; CHECK-NEXT: fsub v18.2d, v1.2d, v20.2d -; CHECK-NEXT: zip2 v0.2d, v16.2d, v4.2d -; CHECK-NEXT: fmla v19.2d, v3.2d, v5.2d -; CHECK-NEXT: st2 { v17.2d, v18.2d }, [x0] -; CHECK-NEXT: fmls v19.2d, v6.2d, v0.2d -; CHECK-NEXT: fmla v20.2d, v6.2d, v5.2d -; CHECK-NEXT: fmla v20.2d, v3.2d, v0.2d -; CHECK-NEXT: st2 { v19.2d, v20.2d }, [x1] +; CHECK-NEXT: ldp q17, q16, [sp, #64] +; CHECK-NEXT: movi v20.2d, #0000000000000000 +; CHECK-NEXT: movi v21.2d, #0000000000000000 +; CHECK-NEXT: movi v24.2d, 
#0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 +; CHECK-NEXT: ldp q19, q18, [sp, #96] +; CHECK-NEXT: fcmla v24.2d, v2.2d, v0.2d, #0 +; CHECK-NEXT: fcmla v25.2d, v3.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v20.2d, v19.2d, v17.2d, #0 +; CHECK-NEXT: fcmla v24.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcmla v21.2d, v18.2d, v16.2d, #0 +; CHECK-NEXT: ldp q23, q22, [sp, #32] +; CHECK-NEXT: fcmla v20.2d, v19.2d, v17.2d, #90 +; CHECK-NEXT: fcmla v25.2d, v3.2d, v1.2d, #90 +; CHECK-NEXT: fcmla v21.2d, v18.2d, v16.2d, #90 +; CHECK-NEXT: fcmla v20.2d, v6.2d, v4.2d, #0 +; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: fcmla v21.2d, v7.2d, v5.2d, #0 +; CHECK-NEXT: fcmla v20.2d, v6.2d, v4.2d, #90 +; CHECK-NEXT: fcmla v21.2d, v7.2d, v5.2d, #90 +; CHECK-NEXT: fsub v2.2d, v24.2d, v20.2d +; CHECK-NEXT: fcmla v20.2d, v1.2d, v23.2d, #0 +; CHECK-NEXT: fsub v3.2d, v25.2d, v21.2d +; CHECK-NEXT: fcmla v21.2d, v0.2d, v22.2d, #0 +; CHECK-NEXT: fcmla v20.2d, v1.2d, v23.2d, #90 +; CHECK-NEXT: stp q2, q3, [x0] +; CHECK-NEXT: fcmla v21.2d, v0.2d, v22.2d, #90 +; CHECK-NEXT: stp q20, q21, [x1] ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -115,15 +115,7 @@ define <4 x float> @simple_add_270_false(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: simple_add_270_false: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: zip1 v4.2s, v0.2s, v2.2s -; CHECK-NEXT: zip2 v0.2s, v0.2s, v2.2s -; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s -; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s -; CHECK-NEXT: fadd v1.2s, v1.2s, v4.2s -; CHECK-NEXT: fsub v0.2s, v0.2s, v2.2s -; CHECK-NEXT: zip1 v0.4s, 
v1.4s, v0.4s +; CHECK-NEXT: fcadd v0.4s, v0.4s, v1.4s, #270 ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll @@ -553,11 +553,10 @@ ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov d1, r2, r3 -; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0 -; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90 -; CHECK-NEXT: vneg.f32 q0, q2 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vcmul.f32 q2, q0, q1, #180 +; CHECK-NEXT: vcmla.f32 q2, q0, q1, #270 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d5 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll @@ -118,19 +118,8 @@ define arm_aapcs_vfpcc <4 x float> @simple_add_270_false(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: simple_add_270_false: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vsub.f32 q2, q3, q2 -; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vadd.f32 q1, q1, q0 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s0, s4 -; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vcadd.f32 q2, q0, q1, #270 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %strided.vec = 
shufflevector <4 x float> %a, <4 x float> poison, <2 x i32>