diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -41,6 +41,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -3165,7 +3166,7 @@
   /// If one cannot be created using all the given inputs, nullptr should be
   /// returned.
   virtual Value *createComplexDeinterleavingIR(
-      Instruction *I, ComplexDeinterleavingOperation OperationType,
+      IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
       ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
       Value *Accumulator = nullptr) const {
     return nullptr;
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -184,6 +184,12 @@
 
 class ComplexDeinterleavingGraph {
 public:
+  struct Product { // One fmul term collected from a reassociated sum.
+    Instruction *Multiplier;   // Operand 0 of the fmul, one fneg stripped.
+    Instruction *Multiplicand; // Operand 1 of the fmul, one fneg stripped.
+    bool IsPositive;           // False if the term enters the sum negated.
+  };
+  using Addend = std::pair<Instruction *, bool>; // (non-fmul term, IsPositive)
   using NodePtr = ComplexDeinterleavingCompositeNode::NodePtr;
   using RawNodePtr = ComplexDeinterleavingCompositeNode::RawNodePtr;
   explicit ComplexDeinterleavingGraph(const TargetLowering *TL,
@@ -254,6 +260,45 @@
 
   NodePtr identifyNode(Instruction *I, Instruction *J);
 
+  /// Determine if a sum of complex numbers can be formed from \p RealAddends
+  /// and \p ImagAddends. If \p Accumulator is not null, add the result to it.
+  /// Return nullptr if it is not possible to construct a complex number.
+  NodePtr identifyAdditions(std::list<Addend> &RealAddends,
+                            std::list<Addend> &ImagAddends,
+                            NodePtr Accumulator);
+
+  /// Extract one addend that has both real and imaginary parts positive.
+  NodePtr extractPositiveAddend(std::list<Addend> &RealAddends,
+                                std::list<Addend> &ImagAddends);
+
+  /// Determine if sum of multiplications of complex numbers can be formed from
+  /// \p RealMuls and \p ImagMuls. If \p Accumulator is not null, add the result
+  /// to it. Return nullptr if it is not possible to construct a complex number.
+  NodePtr identifyMultiplications(std::list<Product> &RealMuls,
+                                  std::list<Product> &ImagMuls,
+                                  NodePtr Accumulator);
+
+  /// This function attempts to locate four products that correspond to a single
+  /// complex multiplication. Upon success, it returns two NodePtrs representing
+  /// the complex numbers being multiplied, as well as the rotations for FCMLA
+  /// multiplication. Additionally, it removes the products from RealMuls and
+  /// ImagMuls.
+  /// TODO: This code may not be able to detect multiplications that involve an
+  /// additional 90-degree rotation; however, it can be implemented at a later
+  /// time by extending RotationMap.
+  std::pair<NodePtr, NodePtr>
+  extractOneMul(std::list<Product> &RealMuls, std::list<Product> &ImagMuls,
+                ComplexDeinterleavingRotation &Rotation1,
+                ComplexDeinterleavingRotation &Rotation2);
+
+  /// If the code is compiled with -Ofast or expressions have `reassoc` flag,
+  /// the order of complex computation operations may be significantly altered,
+  /// and the real and imaginary parts may not be executed in parallel. This
+  /// function takes this into consideration and employs a more general approach
+  /// to identify complex computations. Initially, it gathers all the addends
+  /// and multiplicands and then constructs a complex expression from them.
+  NodePtr identifyReassocNodes(Instruction *I, Instruction *J);
+
   NodePtr identifyRoot(Instruction *I);
 
   /// Identifies the Deinterleave operation applied to a vector containing
@@ -265,7 +310,8 @@
   /// intrinsic (for both fixed and scalable vectors)
   NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag);
 
-  Value *replaceNode(RawNodePtr Node);
+  Value *replaceNode(IRBuilderBase &IRB,
+                     ComplexDeinterleavingGraph::RawNodePtr Node);
 
 public:
   void dump() { dump(dbgs()); }
@@ -759,22 +805,377 @@
   auto *VTy = cast<VectorType>(Real->getType());
   auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
 
-  if (TL->isComplexDeinterleavingOperationSupported(
-          ComplexDeinterleavingOperation::CMulPartial, NewVTy) &&
-      isInstructionPairMul(Real, Imag)) {
-    return identifyPartialMul(Real, Imag);
+  bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported(
+      ComplexDeinterleavingOperation::CMulPartial, NewVTy);
+  bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported(
+      ComplexDeinterleavingOperation::CAdd, NewVTy);
+
+  if (HasCMulSupport && isInstructionPairMul(Real, Imag)) {
+    Node = identifyPartialMul(Real, Imag);
+    if (Node)
+      return Node;
+  }
+
+  if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) {
+    Node = identifyAdd(Real, Imag);
+    if (Node)
+      return Node;
+  }
+
+  if (HasCMulSupport && HasCAddSupport) {
+    Node = identifyReassocNodes(Real, Imag);
+    if (Node)
+      return Node;
+  }
+
+  Node = identifySymmetricOperation(Real, Imag);
+  if (Node)
+    return Node;
+
+  LLVM_DEBUG(dbgs() << "  - Not recognised as a valid pattern.\n");
+  return nullptr;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
+                                                 Instruction *Imag) {
+  // The root of a reassociated expression must be an fadd, fsub or fneg.
+  auto IsSumRoot = [](unsigned Opcode) {
+    return Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+           Opcode == Instruction::FNeg;
+  };
+  if (!IsSumRoot(Real->getOpcode()) || !IsSumRoot(Imag->getOpcode()))
+    return nullptr;
+
+  // Flatten the expression rooted at Insn into signed fmul products (Muls) and
+  // signed non-fmul terms (Addends). Returns false if that is not possible.
+  auto Collect = [](Instruction *Insn, std::list<Product> &Muls,
+                    std::list<Addend> &Addends) -> bool {
+    SmallVector<PointerIntPair<Value *, 1, bool>> Worklist = {{Insn, true}};
+    SmallPtrSet<Value *, 8> Visited;
+    while (!Worklist.empty()) {
+      auto [V, IsPositive] = Worklist.back();
+      Worklist.pop_back();
+      // A value reached twice contributes twice to the sum. Silently dropping
+      // the repeat would change the result, so conservatively give up.
+      if (!Visited.insert(V).second)
+        return false;
+      Instruction *I = dyn_cast<Instruction>(V);
+      if (!I)
+        return false;
+
+      if (I->getOpcode() == Instruction::FAdd) {
+        Worklist.emplace_back(I->getOperand(1), IsPositive);
+        Worklist.emplace_back(I->getOperand(0), IsPositive);
+      } else if (I->getOpcode() == Instruction::FSub) {
+        Worklist.emplace_back(I->getOperand(1), IsPositive ^ true);
+        Worklist.emplace_back(I->getOperand(0), IsPositive);
+      } else if (I->getOpcode() == Instruction::FMul) {
+        // Strip one fneg from each factor and fold it into the product sign.
+        Instruction *Factors[2];
+        for (unsigned Idx = 0; Idx != 2; ++Idx) {
+          auto *Op = dyn_cast<Instruction>(I->getOperand(Idx));
+          if (Op && Op->getOpcode() == Instruction::FNeg) {
+            Op = dyn_cast<Instruction>(Op->getOperand(0));
+            IsPositive ^= true;
+          }
+          if (!Op)
+            return false;
+          Factors[Idx] = Op;
+        }
+        Muls.push_back(Product{Factors[0], Factors[1], IsPositive});
+      } else if (I->getOpcode() == Instruction::FNeg) {
+        Worklist.emplace_back(I->getOperand(0), IsPositive ^ true);
+      } else {
+        Addends.emplace_back(I, IsPositive);
+        continue;
+      }
+
+      if (!I->getFastMathFlags().allowReassoc()) {
+        LLVM_DEBUG(dbgs() << "Reassoc is missing from the FastMath flags: "
+                          << *I << "\n");
+        return false;
+      }
+    }
+    return true;
+  };
+
+  std::list<Product> RealMuls, ImagMuls;
+  std::list<Addend> RealAddends, ImagAddends;
+  if (!Collect(Real, RealMuls, RealAddends) ||
+      !Collect(Imag, ImagMuls, ImagAddends))
+    return nullptr;
+
+  if (RealAddends.size() != ImagAddends.size())
+    return nullptr;
+
+  NodePtr FinalNode;
+  if (!RealMuls.empty() || !ImagMuls.empty()) {
+    // With products present, first try to use a positive addend as accumulator.
+    FinalNode = extractPositiveAddend(RealAddends, ImagAddends);
+    FinalNode = identifyMultiplications(RealMuls, ImagMuls, FinalNode);
+    if (!FinalNode)
+      return nullptr;
+  }
+
+  if (!RealAddends.empty() || !ImagAddends.empty()) {
+    FinalNode = identifyAdditions(RealAddends, ImagAddends, FinalNode);
+    if (!FinalNode)
+      return nullptr;
+  }
+  assert(FinalNode && "FinalNode can not be nullptr here");
+  FinalNode->Real = Real;
+  FinalNode->Imag = Imag;
+  submitCompositeNode(FinalNode);
+  return FinalNode;
+}
+
+std::pair<ComplexDeinterleavingGraph::NodePtr,
+          ComplexDeinterleavingGraph::NodePtr>
+ComplexDeinterleavingGraph::extractOneMul(
+    std::list<Product> &RealMuls, std::list<Product> &ImagMuls,
+    ComplexDeinterleavingRotation &Rotation1,
+    ComplexDeinterleavingRotation &Rotation2) {
+  auto ExtractCommon = [](Product &Real, Product &Imag) -> Instruction * {
+    if (Real.Multiplicand == Imag.Multiplicand ||
+        Real.Multiplicand == Imag.Multiplier)
+      return Real.Multiplicand;
+
+    if (Real.Multiplier == Imag.Multiplicand ||
+        Real.Multiplier == Imag.Multiplier)
+      return Real.Multiplier;
+
+    return nullptr;
+  };
+
+  // The following table illustrates the relationship between multiplications
+  // and rotations. If we consider the multiplication (X + iY) * (U + iV), we
+  // can see:
+  //
+  // Rotation |   Real |   Imag |
+  // ---------+--------+--------+
+  //        0 |  x * u |  x * v |
+  //       90 | -y * v |  y * u |
+  //      180 | -x * u | -x * v |
+  //      270 |  y * v | -y * u |
+  //
+  // This code will attempt to identify four multiplications that share common
+  // operands, and can be represented by two of the expressions in the table
+  // above. The rotations will be determined, and it will be verified if the
+  // identified operands correspond to two complex numbers
+  NodePtr NodeA, NodeB;
+  Instruction *X, *Y, *U, *V;
+  auto ItR1 = RealMuls.begin();
+  for (auto ItI1 = ImagMuls.begin(); ItI1 != ImagMuls.end(); ++ItI1) {
+    X = ExtractCommon(*ItR1, *ItI1);
+    if (!X)
+      continue;
+
+    U = ItR1->Multiplicand == X ? ItR1->Multiplier : ItR1->Multiplicand;
+    V = ItI1->Multiplicand == X ? ItI1->Multiplier : ItI1->Multiplicand;
+
+    for (auto ItR2 = std::next(ItR1); ItR2 != RealMuls.end(); ++ItR2) {
+      auto *Tmp = ExtractCommon(*ItR2, *ItI1);
+      if (!Tmp || Tmp != V)
+        continue;
+
+      Y = ItR2->Multiplicand == V ? ItR2->Multiplier : ItR2->Multiplicand;
+
+      for (auto ItI2 = ImagMuls.begin(); ItI2 != ImagMuls.end(); ++ItI2) {
+        if (ItI1 == ItI2)
+          continue;
+
+        if ((ItI2->Multiplier != U && ItI2->Multiplicand != U) ||
+            (ItI2->Multiplier != Y && ItI2->Multiplicand != Y))
+          continue;
+
+        LLVM_DEBUG({
+          dbgs() << "Found potential complex multiplication:\n";
+          for (auto It : {ItR1, ItR2, ItI1, ItI2})
+            dbgs().indent(4) << (It->IsPositive ? "+" : "-") << *It->Multiplier
+                             << " multiplied by " << *It->Multiplicand << "\n";
+        });
+
+        // We detect only pairs of multiplication with 0 and 90 or 180 and 270
+        static const ComplexDeinterleavingRotation RotationMap[2][2] = {
+            {ComplexDeinterleavingRotation::Rotation_180,
+             ComplexDeinterleavingRotation::Rotation_90},
+            {ComplexDeinterleavingRotation::Rotation_270,
+             ComplexDeinterleavingRotation::Rotation_0}};
+
+        Rotation1 = RotationMap[ItR1->IsPositive][ItI1->IsPositive];
+        Rotation2 = RotationMap[ItR2->IsPositive][ItI2->IsPositive];
+
+        // Orient the operands on local copies: X, Y, U and V steer the
+        // enclosing searches and must survive a rejected candidate unchanged.
+        Instruction *AReal = X, *AImag = Y, *BReal = U, *BImag = V;
+        if (Rotation1 == ComplexDeinterleavingRotation::Rotation_0 ||
+            Rotation1 == ComplexDeinterleavingRotation::Rotation_180) {
+          if (Rotation2 != ComplexDeinterleavingRotation::Rotation_90 &&
+              Rotation2 != ComplexDeinterleavingRotation::Rotation_270)
+            continue;
+        } else {
+          if (Rotation2 != ComplexDeinterleavingRotation::Rotation_0 &&
+              Rotation2 != ComplexDeinterleavingRotation::Rotation_180)
+            continue;
+          std::swap(AReal, AImag);
+          std::swap(BReal, BImag);
+        }
+
+        NodeA = identifyNode(AReal, AImag);
+        if (!NodeA)
+          continue;
+        NodeB = identifyNode(BReal, BImag);
+        if (!NodeB)
+          continue;
+
+        LLVM_DEBUG({
+          dbgs() << "Identified multiplication (X, Y) * (U, V):\n";
+          dbgs().indent(4) << "X: " << *AReal << "\n";
+          dbgs().indent(4) << "Y: " << *AImag << "\n";
+          dbgs().indent(4) << "U: " << *BReal << "\n";
+          dbgs().indent(4) << "V: " << *BImag << "\n";
+          dbgs().indent(4) << "Rotation #1 - " << (int)Rotation1 * 90
+                           << ", Rotation #2 - " << (int)Rotation2 * 90 << "\n";
+        });
+        RealMuls.erase(ItR1);
+        RealMuls.erase(ItR2);
+        ImagMuls.erase(ItI1);
+        ImagMuls.erase(ItI2);
+        return std::make_pair(NodeA, NodeB);
+      }
+    }
   }
+  return std::make_pair(NodeA, NodeB);
+}
 
-  if (TL->isComplexDeinterleavingOperationSupported(
-          ComplexDeinterleavingOperation::CAdd, NewVTy) &&
-      isInstructionPairAdd(Real, Imag)) {
-    return identifyAdd(Real, Imag);
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyMultiplications(
+    std::list<Product> &RealMuls, std::list<Product> &ImagMuls,
+    NodePtr Accumulator = nullptr) {
+  // Both lists must hold the same, even number of products.
+  const size_t NumRealMuls = RealMuls.size();
+  if (NumRealMuls % 2 != 0 || ImagMuls.size() != NumRealMuls)
+    return nullptr;
+
+  // Builds and submits one CMulPartial node with the given rotation and the
+  // operands Lhs, Rhs and, when it is non-null, Acc.
+  auto EmitPartialMul = [this](ComplexDeinterleavingRotation Rotation,
+                               NodePtr Lhs, NodePtr Rhs, NodePtr Acc) {
+    NodePtr Mul = prepareCompositeNode(
+        ComplexDeinterleavingOperation::CMulPartial, nullptr, nullptr);
+    Mul->Rotation = Rotation;
+    Mul->addOperand(Lhs);
+    Mul->addOperand(Rhs);
+    if (Acc)
+      Mul->addOperand(Acc);
+    submitCompositeNode(Mul);
+    return Mul;
+  };
+
+  NodePtr Chain = Accumulator;
+  while (!RealMuls.empty()) {
+    ComplexDeinterleavingRotation FirstRot, SecondRot;
+    auto [NodeA, NodeB] =
+        extractOneMul(RealMuls, ImagMuls, FirstRot, SecondRot);
+    if (!NodeA || !NodeB)
+      return nullptr;
+
+    // Chain the two partial multiplications, the second feeding on the first.
+    Chain = EmitPartialMul(FirstRot, NodeA, NodeB, Chain);
+    Chain = EmitPartialMul(SecondRot, NodeA, NodeB, Chain);
   }
+  return Chain;
+}
 
-  auto Symmetric = identifySymmetricOperation(Real, Imag);
-  LLVM_DEBUG(if (Symmetric == nullptr) dbgs()
-             << "  - Not recognised as a valid pattern.\n");
-  return Symmetric;
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyAdditions(std::list<Addend> &RealAddends,
+                                              std::list<Addend> &ImagAddends,
+                                              NodePtr Accumulator = nullptr) {
+  if (RealAddends.size() != ImagAddends.size())
+    return nullptr;
+
+  // Start the sum from the accumulator when one is supplied; otherwise pull
+  // out an addend whose real and imaginary parts are both positive.
+  NodePtr Sum = Accumulator;
+  if (!Sum)
+    Sum = extractPositiveAddend(RealAddends, ImagAddends);
+  if (!Sum)
+    return nullptr;
+
+  // Rotation assigned to an addend, indexed by
+  // [real part is positive][imaginary part is positive].
+  static const ComplexDeinterleavingRotation SignsToRotation[2][2] = {
+      {ComplexDeinterleavingRotation::Rotation_180,
+       ComplexDeinterleavingRotation::Rotation_90},
+      {ComplexDeinterleavingRotation::Rotation_270,
+       ComplexDeinterleavingRotation::Rotation_0}};
+
+  while (!RealAddends.empty()) {
+    // Always consume the first outstanding real addend and search for an
+    // imaginary addend that pairs with it.
+    const auto RealIt = RealAddends.begin();
+    auto [R, IsPositiveR] = *RealIt;
+
+    auto ImagIt = ImagAddends.begin();
+    for (; ImagIt != ImagAddends.end(); ++ImagIt) {
+      auto [I, IsPositiveI] = *ImagIt;
+      const ComplexDeinterleavingRotation Rotation =
+          SignsToRotation[IsPositiveR][IsPositiveI];
+
+      // Rotations of 0 and 180 degrees (equal signs) keep the (real, imag)
+      // order; 90 and 270 degrees exchange the two parts.
+      NodePtr AddNode = IsPositiveR == IsPositiveI ? identifyNode(R, I)
+                                                   : identifyNode(I, R);
+      if (!AddNode)
+        continue;
+
+      LLVM_DEBUG({
+        dbgs() << "Identified addend:\n";
+        dbgs().indent(4) << "X: " << *R << "\n";
+        dbgs().indent(4) << "Y: " << *I << "\n";
+        dbgs().indent(4) << "Rotation - " << (int)Rotation << "\n";
+      });
+
+      // Fold the addend into the running sum with a CAdd of this rotation.
+      NodePtr CAddNode = prepareCompositeNode(
+          ComplexDeinterleavingOperation::CAdd, nullptr, nullptr);
+      CAddNode->Rotation = Rotation;
+      CAddNode->addOperand(Sum);
+      CAddNode->addOperand(AddNode);
+      submitCompositeNode(CAddNode);
+      Sum = CAddNode;
+      break;
+    }
+
+    // Give up if nothing could be paired with the current real addend.
+    if (ImagIt == ImagAddends.end())
+      return nullptr;
+    // The pair is now part of Sum; remove it from both lists.
+    RealAddends.erase(RealIt);
+    ImagAddends.erase(ImagIt);
+  }
+  return Sum;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::extractPositiveAddend(
+    std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends) {
+  // Scan (real, imaginary) combinations in list order and remove the first
+  // pair of positive addends that identifyNode accepts as a complex node.
+  for (auto RIt = RealAddends.begin(); RIt != RealAddends.end(); ++RIt) {
+    for (auto IIt = ImagAddends.begin(); IIt != ImagAddends.end(); ++IIt) {
+      if (!RIt->second || !IIt->second)
+        continue;
+      NodePtr Found = identifyNode(RIt->first, IIt->first);
+      if (!Found)
+        continue;
+      RealAddends.erase(RIt);
+      ImagAddends.erase(IIt);
+      return Found;
+    }
+  }
+  return nullptr;
 }
 
 bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
@@ -1010,7 +1411,8 @@
   return submitCompositeNode(PlaceholderNode);
 }
 
-static Value *replaceSymmetricNode(ComplexDeinterleavingGraph::RawNodePtr Node,
+static Value *replaceSymmetricNode(IRBuilderBase &B,
+                                   ComplexDeinterleavingGraph::RawNodePtr Node,
                                    Value *InputA, Value *InputB) {
   Instruction *I = Node->Real;
   if (I->isUnaryOp())
@@ -1020,8 +1422,6 @@
     assert(InputB && "Binary symmetric operations need two inputs, only one "
                      "was provided.");
 
-  IRBuilder<> B(I);
-
   switch (I->getOpcode()) {
   case Instruction::FNeg:
     return B.CreateFNegFMF(InputA, I);
@@ -1037,26 +1437,25 @@
 }
 
 Value *ComplexDeinterleavingGraph::replaceNode(
-    ComplexDeinterleavingGraph::RawNodePtr Node) {
+    IRBuilderBase &IRB, ComplexDeinterleavingGraph::RawNodePtr Node) {
   if (Node->ReplacementNode)
     return Node->ReplacementNode;
 
-  Value *Input0 = replaceNode(Node->Operands[0]);
+  Value *Input0 = replaceNode(IRB, Node->Operands[0]);
   Value *Input1 =
-      Node->Operands.size() > 1 ? replaceNode(Node->Operands[1]) : nullptr;
+      Node->Operands.size() > 1 ? replaceNode(IRB, Node->Operands[1]) : nullptr;
   Value *Accumulator =
-      Node->Operands.size() > 2 ? replaceNode(Node->Operands[2]) : nullptr;
+      Node->Operands.size() > 2 ? replaceNode(IRB, Node->Operands[2]) : nullptr;
 
   if (Input1)
     assert(Input0->getType() == Input1->getType() &&
            "Node inputs need to be of the same type");
 
   if (Node->Operation == ComplexDeinterleavingOperation::Symmetric)
-    Node->ReplacementNode = replaceSymmetricNode(Node, Input0, Input1);
+    Node->ReplacementNode = replaceSymmetricNode(IRB, Node, Input0, Input1);
   else
     Node->ReplacementNode = TL->createComplexDeinterleavingIR(
-        Node->Real, Node->Operation, Node->Rotation, Input0, Input1,
-        Accumulator);
+        IRB, Node->Operation, Node->Rotation, Input0, Input1, Accumulator);
 
   assert(Node->ReplacementNode && "Target failed to create Intrinsic call.");
   NumComplexTransformations += 1;
@@ -1073,7 +1472,7 @@
 
     IRBuilder<> Builder(RootInstruction);
     auto RootNode = RootToNode[RootInstruction];
-    Value *R = replaceNode(RootNode.get());
+    Value *R = replaceNode(Builder, RootNode.get());
     assert(R && "Unable to find replacement for RootInstruction");
     DeadInstrRoots.push_back(RootInstruction);
     RootInstruction->replaceAllUsesWith(R);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 
 namespace llvm {
@@ -838,7 +839,7 @@
       ComplexDeinterleavingOperation Operation, Type *Ty) const override;
 
   Value *createComplexDeinterleavingIR(
-      Instruction *I, ComplexDeinterleavingOperation OperationType,
+      IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
       ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
       Value *Accumulator = nullptr) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24646,14 +24646,12 @@
 }
 
 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
-    Instruction *I, ComplexDeinterleavingOperation OperationType,
+    IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
     ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
     Value *Accumulator) const {
   VectorType *Ty = cast<VectorType>(InputA->getType());
   bool IsScalable = Ty->isScalableTy();
 
-  IRBuilder<> B(I);
-
   unsigned TyWidth =
       Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
 
@@ -24677,9 +24675,9 @@
           B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
     }
     auto *LowerSplitInt = createComplexDeinterleavingIR(
-        I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
+        B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
     auto *UpperSplitInt = createComplexDeinterleavingIR(
-        I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
+        B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
 
     auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
                                         B.getInt64(0));
@@ -24708,14 +24706,16 @@
   }
 
   if (OperationType == ComplexDeinterleavingOperation::CAdd) {
+    if (Rotation == ComplexDeinterleavingRotation::Rotation_0) {
+      return B.CreateFAdd(InputA, InputB);
+    } else if (Rotation == ComplexDeinterleavingRotation::Rotation_180) {
+      return B.CreateFSub(InputA, InputB);
+    }
     if (IsScalable) {
       auto *Mask = B.CreateVectorSplat(Ty->getElementCount(), B.getInt1(true));
-      if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
-          Rotation == ComplexDeinterleavingRotation::Rotation_270)
-        return B.CreateIntrinsic(
-            Intrinsic::aarch64_sve_fcadd, Ty,
-            {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
-      return nullptr;
+      return B.CreateIntrinsic(
+          Intrinsic::aarch64_sve_fcadd, Ty,
+          {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
     }
 
     Intrinsic::ID IntId = Intrinsic::not_intrinsic;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -747,7 +747,7 @@
         ComplexDeinterleavingOperation Operation, Type *Ty) const override;
 
     Value *createComplexDeinterleavingIR(
-        Instruction *I, ComplexDeinterleavingOperation OperationType,
+        IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
         ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
         Value *Accumulator = nullptr) const override;
 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -22049,14 +22049,12 @@
 }
 
 Value *ARMTargetLowering::createComplexDeinterleavingIR(
-    Instruction *I, ComplexDeinterleavingOperation OperationType,
+    IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
     ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
     Value *Accumulator) const {
 
   FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
 
-  IRBuilder<> B(I);
-
   unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
 
   assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
@@ -22081,9 +22079,9 @@
     }
 
     auto *LowerSplitInt = createComplexDeinterleavingIR(
-        I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
+        B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
     auto *UpperSplitInt = createComplexDeinterleavingIR(
-        I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
+        B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
 
     ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
     return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
@@ -22103,6 +22101,12 @@
   }
 
   if (OperationType == ComplexDeinterleavingOperation::CAdd) {
+    if (Rotation == ComplexDeinterleavingRotation::Rotation_0) {
+      return B.CreateFAdd(InputA, InputB);
+    } else if (Rotation == ComplexDeinterleavingRotation::Rotation_180) {
+      return B.CreateFSub(InputA, InputB);
+    }
+
     // 1 means the value is not halved.
     auto *ConstHalving = ConstantInt::get(IntTy, 1);
 
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
@@ -7,18 +7,12 @@
 define <4 x double> @mull_add(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
 ; CHECK-LABEL: mull_add:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    zip2 v6.2d, v4.2d, v5.2d
-; CHECK-NEXT:    zip1 v7.2d, v0.2d, v1.2d
-; CHECK-NEXT:    zip2 v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    zip1 v1.2d, v4.2d, v5.2d
-; CHECK-NEXT:    zip1 v4.2d, v2.2d, v3.2d
-; CHECK-NEXT:    zip2 v2.2d, v2.2d, v3.2d
-; CHECK-NEXT:    fmla v6.2d, v0.2d, v4.2d
-; CHECK-NEXT:    fmla v1.2d, v7.2d, v4.2d
-; CHECK-NEXT:    fmla v6.2d, v7.2d, v2.2d
-; CHECK-NEXT:    fmls v1.2d, v0.2d, v2.2d
-; CHECK-NEXT:    zip1 v0.2d, v1.2d, v6.2d
-; CHECK-NEXT:    zip2 v1.2d, v1.2d, v6.2d
+; CHECK-NEXT:    fcmla v4.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT:    fcmla v5.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT:    fcmla v4.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT:    fcmla v5.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    mov v0.16b, v4.16b
+; CHECK-NEXT:    mov v1.16b, v5.16b
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -43,25 +37,18 @@
 define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
 ; CHECK-LABEL: mul_add_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    zip1 v16.2d, v2.2d, v3.2d
-; CHECK-NEXT:    zip1 v17.2d, v0.2d, v1.2d
-; CHECK-NEXT:    zip2 v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    zip2 v1.2d, v2.2d, v3.2d
-; CHECK-NEXT:    zip1 v2.2d, v4.2d, v5.2d
-; CHECK-NEXT:    zip2 v3.2d, v4.2d, v5.2d
-; CHECK-NEXT:    fmul v4.2d, v16.2d, v0.2d
-; CHECK-NEXT:    zip1 v5.2d, v6.2d, v7.2d
-; CHECK-NEXT:    zip2 v6.2d, v6.2d, v7.2d
-; CHECK-NEXT:    fmul v0.2d, v1.2d, v0.2d
-; CHECK-NEXT:    fmul v7.2d, v16.2d, v17.2d
-; CHECK-NEXT:    fmla v4.2d, v17.2d, v1.2d
-; CHECK-NEXT:    fmla v0.2d, v3.2d, v6.2d
-; CHECK-NEXT:    fmla v7.2d, v2.2d, v5.2d
-; CHECK-NEXT:    fmla v4.2d, v3.2d, v5.2d
-; CHECK-NEXT:    fsub v1.2d, v7.2d, v0.2d
-; CHECK-NEXT:    fmla v4.2d, v2.2d, v6.2d
-; CHECK-NEXT:    zip1 v0.2d, v1.2d, v4.2d
-; CHECK-NEXT:    zip2 v1.2d, v1.2d, v4.2d
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v16.2d, v4.2d, v6.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v4.2d, v6.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #90
+; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    mov v0.16b, v16.16b
+; CHECK-NEXT:    mov v1.16b, v17.16b
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -94,26 +81,18 @@
 define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
 ; CHECK-LABEL: mul_sub_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    zip1 v17.2d, v2.2d, v3.2d
-; CHECK-NEXT:    zip1 v18.2d, v0.2d, v1.2d
-; CHECK-NEXT:    zip2 v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    zip2 v1.2d, v2.2d, v3.2d
-; CHECK-NEXT:    zip2 v2.2d, v4.2d, v5.2d
-; CHECK-NEXT:    zip1 v3.2d, v6.2d, v7.2d
-; CHECK-NEXT:    zip1 v16.2d, v4.2d, v5.2d
-; CHECK-NEXT:    fmul v4.2d, v17.2d, v0.2d
-; CHECK-NEXT:    fmul v5.2d, v17.2d, v18.2d
-; CHECK-NEXT:    fmul v0.2d, v1.2d, v0.2d
-; CHECK-NEXT:    zip2 v6.2d, v6.2d, v7.2d
-; CHECK-NEXT:    fmul v7.2d, v3.2d, v2.2d
-; CHECK-NEXT:    fmla v4.2d, v18.2d, v1.2d
-; CHECK-NEXT:    fmla v0.2d, v16.2d, v3.2d
-; CHECK-NEXT:    fmla v5.2d, v2.2d, v6.2d
-; CHECK-NEXT:    fmla v7.2d, v16.2d, v6.2d
-; CHECK-NEXT:    fsub v1.2d, v5.2d, v0.2d
-; CHECK-NEXT:    fsub v2.2d, v4.2d, v7.2d
-; CHECK-NEXT:    zip1 v0.2d, v1.2d, v2.2d
-; CHECK-NEXT:    zip2 v1.2d, v1.2d, v2.2d
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v16.2d, v4.2d, v6.2d, #270
+; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #270
+; CHECK-NEXT:    fcmla v16.2d, v4.2d, v6.2d, #180
+; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #180
+; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    mov v0.16b, v16.16b
+; CHECK-NEXT:    mov v1.16b, v17.16b
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -146,25 +125,18 @@
 define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
 ; CHECK-LABEL: mul_conj_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    zip2 v16.2d, v2.2d, v3.2d
-; CHECK-NEXT:    zip2 v17.2d, v0.2d, v1.2d
-; CHECK-NEXT:    zip1 v2.2d, v2.2d, v3.2d
-; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    fmul v3.2d, v16.2d, v17.2d
-; CHECK-NEXT:    fmul v1.2d, v2.2d, v17.2d
-; CHECK-NEXT:    zip1 v17.2d, v4.2d, v5.2d
-; CHECK-NEXT:    zip2 v4.2d, v4.2d, v5.2d
-; CHECK-NEXT:    fneg v3.2d, v3.2d
-; CHECK-NEXT:    zip1 v5.2d, v6.2d, v7.2d
-; CHECK-NEXT:    fmla v1.2d, v0.2d, v16.2d
-; CHECK-NEXT:    fmla v3.2d, v0.2d, v2.2d
-; CHECK-NEXT:    zip2 v0.2d, v6.2d, v7.2d
-; CHECK-NEXT:    fmls v1.2d, v4.2d, v5.2d
-; CHECK-NEXT:    fmla v3.2d, v17.2d, v5.2d
-; CHECK-NEXT:    fmla v1.2d, v17.2d, v0.2d
-; CHECK-NEXT:    fmla v3.2d, v4.2d, v0.2d
-; CHECK-NEXT:    zip1 v0.2d, v3.2d, v1.2d
-; CHECK-NEXT:    zip2 v1.2d, v3.2d, v1.2d
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    fcmla v16.2d, v6.2d, v4.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v7.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v6.2d, v4.2d, #270
+; CHECK-NEXT:    fcmla v17.2d, v7.2d, v5.2d, #270
+; CHECK-NEXT:    mov v0.16b, v16.16b
+; CHECK-NEXT:    mov v1.16b, v17.16b
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
@@ -6,21 +6,13 @@
 define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c) {
 ; CHECK-LABEL: mull_add:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp2 z6.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z7.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z1.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmla z1.d, p0/m, z4.d, z7.d
-; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
-; CHECK-NEXT:    movprfx z5, z6
-; CHECK-NEXT:    fmla z5.d, p0/m, z4.d, z0.d
-; CHECK-NEXT:    movprfx z3, z5
-; CHECK-NEXT:    fmla z3.d, p0/m, z2.d, z7.d
-; CHECK-NEXT:    fmls z1.d, p0/m, z2.d, z0.d
-; CHECK-NEXT:    zip1 z0.d, z1.d, z3.d
-; CHECK-NEXT:    zip2 z1.d, z1.d, z3.d
+; CHECK-NEXT:    fcmla z4.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT:    fcmla z5.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT:    fcmla z4.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT:    fcmla z5.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT:    mov z0.d, z4.d
+; CHECK-NEXT:    mov z1.d, z5.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -47,26 +39,19 @@
 define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
 ; CHECK-LABEL: mul_add_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
-; CHECK-NEXT:    uzp2 z24.d, z2.d, z3.d
-; CHECK-NEXT:    fmul z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEXT:    uzp2 z3.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z26.d, z6.d, z7.d
-; CHECK-NEXT:    fmul z1.d, z1.d, z25.d
-; CHECK-NEXT:    fmul z0.d, z24.d, z0.d
-; CHECK-NEXT:    uzp1 z4.d, z4.d, z5.d
-; CHECK-NEXT:    uzp2 z5.d, z6.d, z7.d
-; CHECK-NEXT:    fmla z1.d, p0/m, z26.d, z4.d
-; CHECK-NEXT:    fmla z2.d, p0/m, z26.d, z3.d
-; CHECK-NEXT:    fmla z0.d, p0/m, z5.d, z3.d
-; CHECK-NEXT:    fmla z2.d, p0/m, z5.d, z4.d
-; CHECK-NEXT:    fsub z1.d, z1.d, z0.d
-; CHECK-NEXT:    zip1 z0.d, z1.d, z2.d
-; CHECK-NEXT:    zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT:    mov z25.d, z24.d
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #0
+; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #90
+; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #90
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT:    mov z1.d, z24.d
+; CHECK-NEXT:    mov z0.d, z25.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -102,27 +87,19 @@
 define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
 ; CHECK-LABEL: mul_sub_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uzp2 z24.d, z2.d, z3.d
-; CHECK-NEXT:    fmul z2.d, z1.d, z0.d
-; CHECK-NEXT:    fmul z1.d, z1.d, z25.d
-; CHECK-NEXT:    uzp2 z3.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z4.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z5.d, z6.d, z7.d
-; CHECK-NEXT:    uzp2 z6.d, z6.d, z7.d
-; CHECK-NEXT:    fmul z0.d, z24.d, z0.d
-; CHECK-NEXT:    fmla z1.d, p0/m, z6.d, z3.d
-; CHECK-NEXT:    fmul z3.d, z5.d, z3.d
-; CHECK-NEXT:    fmla z0.d, p0/m, z5.d, z4.d
-; CHECK-NEXT:    fmla z3.d, p0/m, z6.d, z4.d
-; CHECK-NEXT:    fmla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEXT:    fsub z1.d, z1.d, z0.d
-; CHECK-NEXT:    fsub z2.d, z2.d, z3.d
-; CHECK-NEXT:    zip1 z0.d, z1.d, z2.d
-; CHECK-NEXT:    zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT:    mov z25.d, z24.d
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #270
+; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #270
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #180
+; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #180
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT:    mov z1.d, z24.d
+; CHECK-NEXT:    mov z0.d, z25.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -158,26 +135,19 @@
 define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
 ; CHECK-LABEL: mul_conj_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp2 z24.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
-; CHECK-NEXT:    fmul z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmul z0.d, z24.d, z0.d
-; CHECK-NEXT:    fmla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEXT:    uzp2 z3.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z4.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z5.d, z6.d, z7.d
-; CHECK-NEXT:    fnmls z0.d, p0/m, z1.d, z25.d
-; CHECK-NEXT:    fmla z0.d, p0/m, z5.d, z4.d
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fmls z1.d, p0/m, z5.d, z3.d
-; CHECK-NEXT:    uzp2 z2.d, z6.d, z7.d
-; CHECK-NEXT:    fmla z1.d, p0/m, z2.d, z4.d
-; CHECK-NEXT:    fmad z3.d, p0/m, z2.d, z0.d
-; CHECK-NEXT:    zip1 z0.d, z3.d, z1.d
-; CHECK-NEXT:    zip2 z1.d, z3.d, z1.d
+; CHECK-NEXT:    mov z25.d, z24.d
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT:    fcmla z24.d, p0/m, z5.d, z7.d, #0
+; CHECK-NEXT:    fcmla z25.d, p0/m, z4.d, z6.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z5.d, z7.d, #270
+; CHECK-NEXT:    fcmla z25.d, p0/m, z4.d, z6.d, #270
+; CHECK-NEXT:    mov z1.d, z24.d
+; CHECK-NEXT:    mov z0.d, z25.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
@@ -220,11 +220,11 @@
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v3.4s, v1.4s, v0.4s, #0
-; CHECK-NEXT:    fcmla v4.4s, v2.4s, v0.4s, #0
-; CHECK-NEXT:    fcmla v3.4s, v1.4s, v0.4s, #90
-; CHECK-NEXT:    fcmla v4.4s, v2.4s, v0.4s, #90
-; CHECK-NEXT:    fcadd v0.4s, v4.4s, v3.4s, #90
+; CHECK-NEXT:    fcmla v3.4s, v2.4s, v0.4s, #0
+; CHECK-NEXT:    fcmla v4.4s, v1.4s, v0.4s, #0
+; CHECK-NEXT:    fcmla v3.4s, v2.4s, v0.4s, #90
+; CHECK-NEXT:    fcmla v4.4s, v1.4s, v0.4s, #90
+; CHECK-NEXT:    fcadd v0.4s, v3.4s, v4.4s, #90
 ; CHECK-NEXT:    ret
 entry:
   %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
@@ -484,9 +484,9 @@
 ; CHECK-LABEL: mul_negequal:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v2.4s, v0.4s, v1.4s, #0
-; CHECK-NEXT:    fcmla v2.4s, v0.4s, v1.4s, #90
-; CHECK-NEXT:    fneg v0.4s, v2.4s
+; CHECK-NEXT:    fcmla v2.4s, v0.4s, v1.4s, #180
+; CHECK-NEXT:    fcmla v2.4s, v0.4s, v1.4s, #270
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
@@ -115,15 +115,7 @@
 define <4 x float> @simple_add_270_false(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: simple_add_270_false:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    zip1 v4.2s, v0.2s, v2.2s
-; CHECK-NEXT:    zip2 v0.2s, v0.2s, v2.2s
-; CHECK-NEXT:    zip1 v2.2s, v1.2s, v3.2s
-; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
-; CHECK-NEXT:    fadd v1.2s, v1.2s, v4.2s
-; CHECK-NEXT:    fsub v0.2s, v0.2s, v2.2s
-; CHECK-NEXT:    zip1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    fcadd v0.4s, v0.4s, v1.4s, #270
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>