diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -0,0 +1,40 @@
+//===- ComplexDeinterleavingPass.h - Complex Deinterleaving Pass -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements generation of target-specific intrinsics to support
+// handling of complex number arithmetic and deinterleaving.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
+#define LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+
+namespace llvm {
+
+class Function;
+class TargetMachine;
+
+struct ComplexDeinterleavingPass
+    : public PassInfoMixin<ComplexDeinterleavingPass> {
+private:
+  TargetMachine *TM;
+
+public:
+  ComplexDeinterleavingPass(TargetMachine *TM) : TM(TM) {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+enum class ComplexDeinterleavingOperation { None, CAdd, CMulPartial };
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -80,6 +80,10 @@
   /// matching during instruction selection.
   FunctionPass *createCodeGenPreparePass();
 
+  /// This pass implements generation of target-specific intrinsics to support
+  /// handling of complex number arithmetic.
+  FunctionPass *createComplexDeinterleavingPass(const TargetMachine *TM);
+
   /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg
   /// load-linked/store-conditional loops.
   extern char &AtomicExpandID;
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -22,6 +22,7 @@
 #ifndef LLVM_CODEGEN_TARGETLOWERING_H
 #define LLVM_CODEGEN_TARGETLOWERING_H
 
+#include "ComplexDeinterleavingPass.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -3051,6 +3052,27 @@
     return isOperationLegalOrCustom(Op, VT);
   }
 
+  /// Does this target support complex deinterleaving?
+  virtual bool isComplexDeinterleavingSupported() const { return false; }
+
+  /// Does this target support complex deinterleaving with the given operation
+  /// and type?
+  virtual bool isComplexDeinterleavingOperationSupported(
+      ComplexDeinterleavingOperation Operation, Type *Ty) const {
+    return false;
+  }
+
+  /// Create the IR node for the given complex deinterleaving operation.
+  /// If one cannot be created using all the given inputs, nullptr should be
+  /// returned.
+ virtual Value * + createComplexDeinterleavingIR(Instruction *I, + ComplexDeinterleavingOperation OperationType, + unsigned Rotation, Value *InputA, Value *InputB, + Value *Accumulator = nullptr) const { + return nullptr; + } + //===--------------------------------------------------------------------===// // Runtime Library hooks // diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -101,26 +101,27 @@ void initializeCFIFixupPass(PassRegistry&); void initializeCFIInstrInserterPass(PassRegistry&); void initializeCFLAndersAAWrapperPassPass(PassRegistry&); -void initializeCFLSteensAAWrapperPassPass(PassRegistry&); +void initializeCFLSteensAAWrapperPassPass(PassRegistry &); void initializeCGProfileLegacyPassPass(PassRegistry &); -void initializeCallGraphDOTPrinterPass(PassRegistry&); -void initializeCallGraphPrinterLegacyPassPass(PassRegistry&); -void initializeCallGraphViewerPass(PassRegistry&); -void initializeCallGraphWrapperPassPass(PassRegistry&); -void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); +void initializeCallGraphDOTPrinterPass(PassRegistry &); +void initializeCallGraphPrinterLegacyPassPass(PassRegistry &); +void initializeCallGraphViewerPass(PassRegistry &); +void initializeCallGraphWrapperPassPass(PassRegistry &); +void initializeCallSiteSplittingLegacyPassPass(PassRegistry &); void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); void initializeCheckDebugMachineModulePass(PassRegistry &); -void initializeCodeGenPreparePass(PassRegistry&); -void initializeConstantHoistingLegacyPassPass(PassRegistry&); -void initializeConstantMergeLegacyPassPass(PassRegistry&); +void initializeCodeGenPreparePass(PassRegistry &); +void initializeComplexDeinterleavingLegacyPassPass(PassRegistry &); +void initializeConstantHoistingLegacyPassPass(PassRegistry &); +void initializeConstantMergeLegacyPassPass(PassRegistry &); void initializeConstraintEliminationPass(PassRegistry &); -void initializeControlHeightReductionLegacyPassPass(PassRegistry&); -void initializeCorrelatedValuePropagationPass(PassRegistry&); -void initializeCostModelAnalysisPass(PassRegistry&); -void initializeCrossDSOCFIPass(PassRegistry&); +void initializeControlHeightReductionLegacyPassPass(PassRegistry &); +void initializeCorrelatedValuePropagationPass(PassRegistry &); +void initializeCostModelAnalysisPass(PassRegistry &); +void initializeCrossDSOCFIPass(PassRegistry &); void initializeCycleInfoWrapperPassPass(PassRegistry &); -void initializeDAEPass(PassRegistry&); -void initializeDAHPass(PassRegistry&); +void initializeDAEPass(PassRegistry &); +void initializeDAHPass(PassRegistry &); void initializeDCELegacyPassPass(PassRegistry&); void initializeDFAJumpThreadingLegacyPassPass(PassRegistry &); void initializeDSELegacyPassPass(PassRegistry&); diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -46,6 +46,7 @@ CodeGenPassBuilder.cpp CodeGenPrepare.cpp CommandFlags.cpp + ComplexDeinterleavingPass.cpp CriticalAntiDepBreaker.cpp DeadMachineInstructionElim.cpp DetectDeadLanes.cpp diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -0,0 +1,1092 @@ +//===- ComplexDeinterleavingPass.cpp 
+//------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass is broken down into 3 steps; Discovery, Identification, and +// Replacement. +// +// Discovery: +// The discovery step is responsible for finding the search space for the +// complex graph. The first Instruction pointer it takes is assumed to be the +// converging shuffle of the complex graph (identified by the mask representing +// an interleaving pattern. e.g. `<0, 2, 1, 3>`), and ascends through the +// operands depth-first to find the respective deinterleaving shuffles +// (identified by the mask being `<0, 2, 4, 6>` or `<1, 3, 5, 7>`). +// Beyond `Instructions[0]` being the converging shuffle, this step makes no +// guarantees as to the order of `Instructions`. +// +// Identification: +// This step is responsible for finding the patterns that can be lowered to +// complex instructions. Iterating over `Instructions`, it first performs some +// pattern matching to find a predictable partial multiply case, performing some +// analysis on the order and operating component of the operands to identify +// which rotation around the argand plane is represented by the pattern. The +// step then attempts to pair up "Orphaned" instructions (instructions that have +// no shared parent that would be part of the same node, e.g. an add and a sub +// that represent a complex add). After attempting to pair orphaned +// instructions, the presence of any instructions outside of composite nodes +// means that the graph cannot be lowered confidently, causing the pass to stop +// analysing the current graph. If it continues, the composite nodes are sorted +// to reflect the underlying instruction order, and the uses are checked to find +// any accumulator cases. +// +// Replacement: +// This step performs the necessary input wrangling (chasing values through +// accumulators, shuffles, and other composite nodes) in order for the target to +// know what to generate. While some additional checks are performed at this +// step, it is expected to finish successfully, while any errors should be +// caught via asserts. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ComplexDeinterleavingPass.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "complex-deinterleaving" + +STATISTIC(NumComplexIntrinsics, "Number of complex intrinsics generated"); + +static cl::opt ComplexArithmeticEnabled( + "enable-complex-arithmetic", + cl::desc("Enable generation of complex arithmetic instructions"), + cl::init(true), cl::Hidden); + +/// Checks the given mask, and determines whether said mask is interleaving. 
+/// +/// To be interleaving, a mask must alternate between `i` and `i + (Length / +/// 2)`, and must contain all numbers within the range of `[0..Length)` (e.g. a +/// 4x vector interleaving mask would be <0, 2, 1, 3>). +static bool isInterleavingMask(ArrayRef Mask); +/// Checks the given mask, and determines whether said mask is deinterleaving. +/// +/// To be interleaving, a mask must increment in steps of 2, and either start +/// with 0 or 1. +/// (e.g. an 8x vector deinterleaving mask would be either <0, 2, 4, 6> or +/// <1, 3, 5, 7>). +static bool isDeinterleavingMask(ArrayRef Mask); + +namespace { + +/// Creates an integer array of length \p len, where each item is \p step more +/// than the previous. An offset can be provided to specify the first element. +static SmallVector createArrayWithStep(int len, int step, int offset = 0) { + SmallVector Arr(len); + for (int j = 0; j < len; j++) + Arr[j] = (j * step) + offset; + return Arr; +} + +/// Creates a deinterleaving mask of the given length at the given offset. +/// A deinterleaving mask looks like <0, 2, 4, 6> or <1, 3, 5, 7> +static SmallVector createDeinterleavingMask(int len, int offset = 0) { + return createArrayWithStep(len, 2, offset); +} + +class ComplexDeinterleavingLegacyPass : public FunctionPass { +public: + static char ID; + + ComplexDeinterleavingLegacyPass(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TM(TM) { + initializeComplexDeinterleavingLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "Complex Arithmetic Pass"; } + + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + } + +private: + const TargetMachine *TM; +}; + +enum OperatingComponent { Real, Imaginary, Unknown }; + +class ComplexDeinterleavingGraph; +struct ComplexDeinterleavingCompositeNode { + + ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op) + : Operation(Op) {} + +private: + friend class ComplexDeinterleavingGraph; + +public: + SmallVector getOperands() { + SmallVector Ops; + + for (const auto &Inst : ContainedInstructions) { + for (Value *V : Inst->operands()) { + auto *I = dyn_cast(V); + if (!I || !contains(I)) { + Ops.push_back(V); + continue; + } + } + } + return Ops; + } + + Value *getOperand(unsigned Idx) { return getOperands()[Idx]; } + + unsigned getNumOperands() { return getOperands().size(); } + + SmallVector ContainedInstructions; + Value *OutputNode = nullptr; + Value *OriginalInput0 = nullptr; + Value *OriginalInput1 = nullptr; + Value *ReplacementNode = nullptr; + bool IsTopLevel = false; + ComplexDeinterleavingOperation Operation; + + bool UsesNegation = false; + unsigned Rotation = 0; + Value *Input0 = nullptr; + Value *Input1 = nullptr; + Value *Accumulator = nullptr; + Value *Accumulatee = nullptr; + + void addInstruction(Instruction *I) { ContainedInstructions.push_back(I); } + bool contains(Instruction *I) { + if (I == ReplacementNode) + return true; + + return llvm::find(ContainedInstructions, I) != ContainedInstructions.end(); + } +}; + +class ComplexDeinterleavingGraph { +private: + using NodePtr = std::shared_ptr; + + SmallVector Instructions; + SmallVector CompositeNodes; + + llvm::TargetTransformInfo::TargetCostKind CostKind = + llvm::TargetTransformInfo::TCK_Latency; + + InstructionCost CostOfIntrinsics; + + /// Determines the operating component of the given Value. 
+ /// This is achieved by looking at the operating component of the Value's + /// operands and, based on the instruction, evaluates what the resulting + /// component would be. + OperatingComponent getOperatingComponentOfValue(Value *V) { + Instruction *I = dyn_cast_or_null(V); + if (!I) + return Unknown; + + if (auto *Shuffle = dyn_cast(I)) { + auto ShuffleMask = Shuffle->getShuffleMask(); + if (isDeinterleavingMask(ShuffleMask)) { + if (ShuffleMask[0] == 0) + return Real; + if (ShuffleMask[0] == 1) + return Imaginary; + } + return Unknown; + } + + if (I->getOpcode() == Instruction::FMul) { + auto Op0Component = getOperatingComponentOfValue(I->getOperand(0)); + auto Op1Component = getOperatingComponentOfValue(I->getOperand(1)); + if (Op0Component == Unknown || Op1Component == Unknown) + return Unknown; + if (Op0Component == Op1Component) + return Real; + return Imaginary; + } + + if (I->getOpcode() == Instruction::FNeg) + return getOperatingComponentOfValue(I->getOperand(0)); + + if (I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) { + auto Op0Component = getOperatingComponentOfValue(I->getOperand(0)); + auto Op1Component = getOperatingComponentOfValue(I->getOperand(1)); + if (Op0Component != Op1Component || Op1Component == Unknown) + return Unknown; + return Op0Component; + } + + return Unknown; + } + + void addInstruction(Instruction *I) { Instructions.push_back(I); } + + void sortCompositeNodes(BasicBlock *B) { + SmallVector NewNodeList; + + // Sort the nodelist based on the instruction order + for (auto &I : *B) { + if (auto CN = findNodeFromOutput(&I)) + NewNodeList.push_back(CN); + } + + for (unsigned i = 0; i < NewNodeList.size(); i++) + CompositeNodes[i] = NewNodeList[i]; + } + + NodePtr findNodeFromOutput(Instruction *I) { + for (const auto &Item : CompositeNodes) { + if (Item->OutputNode == I) + return Item; + } + + return nullptr; + } + + SmallVector findUnmatchedInstructions() { + SmallVector Is; + for (auto &I : Instructions) { + if (shouldIgnoreValue(I)) + continue; + if (getContainingComposite(I) == nullptr) + Is.push_back(I); + } + return Is; + } + + Value *getSharedOperand(Instruction *A, Instruction *B, unsigned &Idx) { + if (A->getNumOperands() != B->getNumOperands()) + return nullptr; + + for (unsigned OpIdx = 0; OpIdx < A->getNumOperands(); OpIdx++) { + auto *Op = A->getOperand(OpIdx); + if (Op == B->getOperand(OpIdx)) { + Idx = OpIdx; + return Op; + } + } + return nullptr; + } + + bool haveSharedUses(Value *A, Value *B) { + if (A->hasOneUser() && B->hasOneUser()) { + auto *AUser = *A->user_begin(); + auto *BUser = *B->user_begin(); + + if (AUser && AUser == BUser) + return true; + + auto AUCN = getContainingComposite(dyn_cast(AUser)); + auto BUCN = getContainingComposite(dyn_cast(BUser)); + + if (AUCN && AUCN == BUCN) + return true; + } + + if (A->getNumUses() != B->getNumUses()) + return false; + + for (const auto &AUser : A->users()) { + bool Found = false; + auto AUCN = getContainingComposite(dyn_cast(AUser)); + for (const auto &BUser : B->users()) { + if (AUser == BUser) { + Found = true; + break; + } + auto BUCN = getContainingComposite(dyn_cast(BUser)); + if (AUCN && AUCN == BUCN) { + Found = true; + break; + } + } + if (!Found) { + LLVM_DEBUG(dbgs() << "AUser doesn't have a match: "; AUser->dump()); + return false; + } + } + + return true; + } + + Value *followUseChain(Value *V) { + if (V->hasOneUser()) + return followUseChain(*V->user_begin()); + + // TODO handle multiple users, but how? 
+ + return V; + } + + Value *getFinalInputReplacement(Instruction *I) { + for (Value *V : I->operands()) { + auto *Op = dyn_cast(V); + while (Op && shouldIgnoreValue(Op)) + Op = dyn_cast(Op->getOperand(0)); + if (Op == nullptr) + continue; + + auto CN = getContainingComposite(Op); + if (CN == nullptr || CN->ReplacementNode == nullptr) + continue; + return followUseChain(CN->ReplacementNode); + } + + return nullptr; + } + + Value *getReplacement(Instruction *I) { + if (!I) + return nullptr; + auto CN = getContainingComposite(I); + if (CN == nullptr || CN->ReplacementNode == nullptr) + return I; + return CN->ReplacementNode; + } + + std::shared_ptr + prepareCompositeNode(ComplexDeinterleavingOperation Operation) { + return std::make_shared(Operation); + } + + void + submitCompositeNode(std::shared_ptr CN) { + CompositeNodes.push_back(CN); + } + + bool containsNode(Instruction *I) { + return llvm::find(Instructions, I) != Instructions.end(); + } + + /// Certain values, such as extends and truncates, should be ignored within + /// the graph for our needs as they contribute towards structure rather than + /// function. + /// + /// e.g. A deinterleaving shuffle provides no functionality itself, + /// and does not need to be explicitly handled beyond the usual operations. A + /// shuffle that is neither interleaving nor deinterleaving is an example of + /// one that needs to be handled, and thus should not be ignored. + bool shouldIgnoreValue(Value *V) { + if (isa(V)) + return true; + + if (auto *SVI = dyn_cast(V)) { + auto Mask = SVI->getShuffleMask(); + return isInterleavingMask(Mask) || isDeinterleavingMask(Mask); + } + + if (auto *I = dyn_cast(V)) { + auto Opc = I->getOpcode(); + return I->isCast() || Opc == Instruction::FPTrunc || + Opc == Instruction::FPExt; + } + return false; + } + + /// Checks the users of the given instructions to evaluate whether the + /// returns from said instructions converge at any point. e.g. in a shuffle + bool doInstructionsConverge(Instruction *A, Instruction *B) { + if (A->hasOneUser() && B->hasOneUser()) { + auto *AUser = *A->user_begin(); + auto *BUser = *B->user_begin(); + + while (shouldIgnoreValue(AUser)) + AUser = *AUser->user_begin(); + while (shouldIgnoreValue(BUser)) + BUser = *BUser->user_begin(); + + if (AUser == BUser) + return true; + } + + return haveSharedUses(A, B); + } + + NodePtr getContainingComposite(Instruction *I) { + if (I == nullptr) + return nullptr; + for (const auto &CN : CompositeNodes) { + if (CN->contains(I)) + return CN; + if (CN->ReplacementNode == I) + return CN; + } + return nullptr; + } + + bool identifyCMulPartial(Instruction *I, const TargetLowering *TL, + bool &ContinueIdentification); + bool identifyOrphanedCMulPartial(Instruction *I, Instruction *J, + const TargetLowering *TL, + bool &ContinueIdentification); + bool identifyCAdd(Instruction *I, Instruction *J, const TargetLowering *TL, + bool &ContinueIdentification); + +public: + /// Step through the use-def chains to find all instruction nodes converging + /// on \p I. + void discoverNodes(BasicBlock *B, Instruction *I); + /// Iterate over the nodes and reducing them to complex nodes where possible. + /// Returns false if the deinterleaving operation should be cancelled for the + /// current graph. + bool identifyNodes(const TargetLowering *TL); + /// Perform the actual replacement of the underlying instruction graph. + /// Returns false if the deinterleaving operation should be cancelled for the + /// current graph. 
+  bool replaceNodes(const TargetLowering *TL);
+  void getDeadRoots(SmallVector<Instruction *> &DeadInstrRoots);
+};
+
+class ComplexDeinterleaving {
+public:
+  ComplexDeinterleaving(const TargetLowering *tl, const TargetLibraryInfo *tli)
+      : TL(tl), TLI(tli) {}
+  bool runOnFunction(Function &F);
+
+private:
+  bool evaluateBasicBlock(BasicBlock *B);
+
+  const TargetLowering *TL = nullptr;
+  const TargetLibraryInfo *TLI = nullptr;
+};
+
+} // namespace
+
+char ComplexDeinterleavingLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ComplexDeinterleavingLegacyPass, DEBUG_TYPE,
+                      "Complex Deinterleaving", false, false)
+INITIALIZE_PASS_END(ComplexDeinterleavingLegacyPass, DEBUG_TYPE,
+                    "Complex Deinterleaving", false, false)
+
+PreservedAnalyses ComplexDeinterleavingPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  if (!ComplexDeinterleaving(TL, &TLI).runOnFunction(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+FunctionPass *llvm::createComplexDeinterleavingPass(const TargetMachine *TM) {
+  return new ComplexDeinterleavingLegacyPass(TM);
+}
+
+bool ComplexDeinterleavingLegacyPass::runOnFunction(Function &F) {
+  const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+  auto TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+  return ComplexDeinterleaving(TL, &TLI).runOnFunction(F);
+}
+
+bool ComplexDeinterleaving::runOnFunction(Function &F) {
+  if (!ComplexArithmeticEnabled) {
+    LLVM_DEBUG(dbgs() << "Complex has been explicitly disabled.\n");
+    return false;
+  }
+
+  if (!TL->isComplexDeinterleavingSupported()) {
+    LLVM_DEBUG(dbgs() << "Complex has been disabled, target does not support "
+                         "lowering of complex numbers.\n");
+    return false;
+  }
+
+  bool Changed = false;
+  for (auto &B : F)
+    Changed |= evaluateBasicBlock(&B);
+
+  return Changed;
+}
+
+static bool isInterleavingMask(ArrayRef<int> Mask) {
+  int HalfNumElements = Mask.size() / 2;
+
+  for (int Idx = 0; Idx < HalfNumElements; ++Idx) {
+    if (Mask[(Idx * 2) + 1] != (Mask[Idx * 2] + HalfNumElements))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isDeinterleavingMask(ArrayRef<int> Mask) {
+  int Offset = Mask[0];
+  int HalfNumElements = Mask.size() / 2;
+
+  for (int Idx = 1; Idx < HalfNumElements; ++Idx) {
+    if (Mask[Idx] != (Idx * 2) + Offset)
+      return false;
+  }
+
+  return true;
+}
+
+bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) {
+  bool Changed = false;
+
+  SmallVector<Instruction *> DeadInstrRoots;
+
+  for (auto &I : *B) {
+    if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I)) {
+      // Look for a shufflevector that takes separate vectors of the real and
+      // imaginary components and recombines them into a single vector.
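+      // Illustrative sketch (hypothetical IR, not taken from this patch's
+      // tests): given deinterleaved <2 x float> values %real and %imag, the
+      // converging shuffle this pass keys off would look like
+      //   shufflevector <2 x float> %real, <2 x float> %imag,
+      //                 <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+      // i.e. an interleaving mask in the sense checked below.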
+      if (isInterleavingMask(SVI->getShuffleMask())) {
+        ComplexDeinterleavingGraph Graph;
+        Graph.discoverNodes(B, SVI);
+        if (Graph.identifyNodes(TL) && Graph.replaceNodes(TL)) {
+          Changed = true;
+          DeadInstrRoots.push_back(SVI);
+        } else {
+          SmallVector<Instruction *> DeadInstrs;
+          Graph.getDeadRoots(DeadInstrs);
+          for (auto It = DeadInstrs.rbegin(); It != DeadInstrs.rend(); It++)
+            (*It)->eraseFromParent();
+        }
+      }
+    }
+  }
+
+  for (const auto &I : DeadInstrRoots)
+    llvm::RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+
+  return Changed;
+}
+
+bool ComplexDeinterleavingGraph::identifyCMulPartial(
+    Instruction *I, const TargetLowering *TL, bool &ContinueIdentification) {
+  if ((match(I, m_FAdd(m_FMul(m_Value(), m_Value()),
+                       m_FMul(m_Value(), m_Value()))) ||
+       match(I, m_FSub(m_FMul(m_Value(), m_Value()),
+                       m_FMul(m_Value(), m_Value()))))) {
+
+    auto *VTy = dyn_cast<FixedVectorType>(I->getType());
+    if (!VTy)
+      return false;
+
+    auto *NewVTy =
+        FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2);
+
+    if (!TL->isComplexDeinterleavingOperationSupported(
+            ComplexDeinterleavingOperation::CMulPartial, NewVTy))
+      return false;
+
+    LLVM_DEBUG(dbgs() << "Composite node built up from "; I->dump());
+    auto CN =
+        prepareCompositeNode(llvm::ComplexDeinterleavingOperation::CMulPartial);
+
+    auto *Op0 = cast<Instruction>(I->getOperand(0));
+    auto *Op1 = cast<Instruction>(I->getOperand(1));
+
+    CN->addInstruction(I);
+    CN->addInstruction(Op0);
+    CN->addInstruction(Op1);
+
+    CN->OriginalInput0 = Op0;
+    CN->OriginalInput1 = Op1;
+
+    bool ContainsNeg = false;
+    for (Value *V : Op0->operands()) {
+      auto *Op = dyn_cast<Instruction>(V);
+      if (Op && Op->getOpcode() == Instruction::FNeg) {
+        if (ContainsNeg)
+          break;
+        CN->addInstruction(Op);
+        CN->OriginalInput0 = Op;
+        ContainsNeg = true;
+      }
+    }
+    for (Value *V : Op1->operands()) {
+      auto *Op = dyn_cast<Instruction>(V);
+      if (Op && Op->getOpcode() == Instruction::FNeg) {
+        if (ContainsNeg)
+          break;
+        CN->addInstruction(Op);
+        CN->OriginalInput1 = Op;
+        ContainsNeg = true;
+      }
+    }
+
+    if (!ContainsNeg) {
+      auto &Use = (*I->use_begin());
+      if (I->getOpcode() == Instruction::FSub) {
+        if (isa<ShuffleVectorInst>(Use.getUser()) && Use.getOperandNo() != 0) {
+          LLVM_DEBUG(dbgs()
+                     << "First converging shuffle operand should be an FSub"
+                     << ".\n");
+          ContinueIdentification = false;
+          return false;
+        }
+      } else if (I->getOpcode() == Instruction::FAdd) {
+        if (isa<ShuffleVectorInst>(Use.getUser()) && Use.getOperandNo() != 1) {
+          LLVM_DEBUG(dbgs()
+                     << "Second converging shuffle operand should be an FAdd"
+                     << ".\n");
+          return false;
+        }
+      }
+    }
+
+    auto Pattern = m_BinOp(m_Shuffle(m_Value(), m_Value()),
+                           m_Shuffle(m_Value(), m_Value()));
+    CN->IsTopLevel = match(CN->OriginalInput0, Pattern) &&
+                     match(CN->OriginalInput1, Pattern);
+    CN->UsesNegation = ContainsNeg;
+    CN->OutputNode = I;
+
+    CN->Rotation = (I->getOpcode() == Instruction::FAdd) * 90;
+
+    if (I->getOpcode() == Instruction::FSub) {
+      auto *SubOp0 = cast<Instruction>(I->getOperand(0));
+      auto SubOp0C0 = getOperatingComponentOfValue(SubOp0->getOperand(0));
+      auto SubOp0C1 = getOperatingComponentOfValue(SubOp0->getOperand(1));
+
+      if (SubOp0C0 == SubOp0C1) {
+        if (SubOp0C0 == OperatingComponent::Imaginary) {
+          CN->Rotation += 90;
+        }
+      }
+    }
+
+    if (CN->UsesNegation)
+      CN->Rotation += 180;
+
+    submitCompositeNode(CN);
+    return true;
+  }
+  ContinueIdentification = true;
+  return false;
+}
+
+bool ComplexDeinterleavingGraph::identifyOrphanedCMulPartial(
+    Instruction *I, Instruction *J, const TargetLowering *TL,
+    bool &ContinueIdentification) {
+  if ((I->getOpcode() == Instruction::FMul &&
+       J->getOpcode() ==
Instruction::FMul)) { + + // At this point, all operands should be instructions + if (!isa(I->getOperand(0)) || + !isa(I->getOperand(1))) + return false; + if (!isa(J->getOperand(0)) || + !isa(J->getOperand(1))) + return false; + + auto *VTy = dyn_cast(I->getType()); + if (!VTy) + return false; + + auto *NewVTy = + FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2); + + if (!TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CMulPartial, NewVTy)) + return false; + + // Partial mul + auto CN = + prepareCompositeNode(llvm::ComplexDeinterleavingOperation::CMulPartial); + CN->addInstruction(I); + CN->addInstruction(J); + CN->OriginalInput0 = I; + CN->OriginalInput1 = J; + + unsigned SharedIdx; + auto *SharedOp = + dyn_cast_or_null(getSharedOperand(I, J, SharedIdx)); + if (SharedOp) { + auto Opc = SharedOp->getOpcode(); + if (Opc == Instruction::FNeg) { + if (SharedIdx == 0) + CN->OriginalInput0 = SharedOp; + else if (SharedIdx == 1) + CN->OriginalInput1 = SharedOp; + else { + LLVM_DEBUG(dbgs() << "Unknown input pattern, somehow the shared " + "operand index is greater than 1.\n"); + return false; + } + CN->addInstruction(SharedOp); + CN->UsesNegation = true; + } + } + + auto Pattern = m_BinOp(m_Shuffle(m_Value(), m_Value()), + m_Shuffle(m_Value(), m_Value())); + CN->IsTopLevel = match(CN->OriginalInput0, Pattern) && + match(CN->OriginalInput1, Pattern); + CN->OutputNode = J; + submitCompositeNode(CN); + return true; + } + ContinueIdentification = true; + return false; +} + +bool ComplexDeinterleavingGraph::identifyCAdd(Instruction *I, Instruction *J, + const TargetLowering *TL, + bool &ContinueIdentification) { + if (((I->getOpcode() == Instruction::FSub && + J->getOpcode() == Instruction::FAdd) || + (I->getOpcode() == Instruction::FAdd && + J->getOpcode() == Instruction::FSub))) { + + auto *VTy = dyn_cast(I->getType()); + if (!VTy) + return false; + + auto *NewVTy = + FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2); + + if (!TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CAdd, NewVTy)) + return false; + + LLVM_DEBUG(dbgs() << "Pairing instructions as a CAdd.\n"); + auto CN = prepareCompositeNode(ComplexDeinterleavingOperation::CAdd); + CN->addInstruction(I); + CN->addInstruction(J); + CN->OriginalInput0 = I; + CN->OriginalInput1 = J; + + auto *Sub = I->getOpcode() == Instruction::FSub ? I : J; + bool IsLikelyNegated = false; + if (auto *Shuffle = dyn_cast(Sub->getOperand(0))) { + auto ShuffleMask = Shuffle->getShuffleMask(); + if (isDeinterleavingMask(ShuffleMask)) + IsLikelyNegated = ShuffleMask[0] == 1; + } + + if (IsLikelyNegated) { + LLVM_DEBUG(dbgs() << "Negated adds are not yet supported.\n"); + return false; + } + + CN->UsesNegation = IsLikelyNegated; + CN->Rotation = 90; + if (I->getOpcode() == Instruction::FAdd) + CN->Rotation = 270; + CN->OutputNode = J; + + Instruction *FAdd = I; + if (FAdd->getOpcode() != Instruction::FAdd) + FAdd = J; + if (getOperatingComponentOfValue(FAdd->getOperand(1)) != + OperatingComponent::Real) { + LLVM_DEBUG(dbgs() << "CAdd.FAdd[1] should be the real component.\n"); + return false; + } + + submitCompositeNode(CN); + return true; + } + ContinueIdentification = true; + return false; +} + +void ComplexDeinterleavingGraph::discoverNodes(BasicBlock *B, Instruction *I) { + + if (I->getParent() != B) + return; + + if (containsNode(I)) + return; + + if (isa(I) || isa(I)) { + // No need to discover beyond a load or a phi. 
+ return; + } + + addInstruction(I); + + if (auto *SVI = dyn_cast(I)) { + auto ShuffleMask = SVI->getShuffleMask(); + + static const int RealMask[] = {0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30}; + static const int ImagMask[] = {1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31}; + + ArrayRef RealMaskRef(RealMask, ShuffleMask.size()); + ArrayRef ImagMaskRef(ImagMask, ShuffleMask.size()); + + Value *ShuffleSource; + if (match(SVI, m_Shuffle(m_Value(ShuffleSource), m_Poison(), + m_SpecificMask(RealMaskRef))) || + match(SVI, m_Shuffle(m_Value(ShuffleSource), m_Poison(), + m_SpecificMask(ImagMaskRef)))) { + // Reached "top" of graph, stop discovering. + // TODO this check needs refining + if (ShuffleSource && + (isa(ShuffleSource) || !isa(ShuffleSource))) + return; + } + } + + for (const auto &Op : I->operands()) { + if (auto *OpI = dyn_cast(Op)) + discoverNodes(B, OpI); + } +} + +bool ComplexDeinterleavingGraph::identifyNodes(const TargetLowering *TL) { + if (Instructions.empty()) { + LLVM_DEBUG(dbgs() << "No Instructions, nothing to identify.\n"); + return false; + } + + auto *ConvergingI = Instructions[0]; + + for (auto &I : Instructions) { + bool ContinueIdentification = false; + if (!identifyCMulPartial(I, TL, ContinueIdentification) && + !ContinueIdentification) + return false; + } + + auto Unmatched = findUnmatchedInstructions(); + SmallVector Pairs; + + for (auto &I : Unmatched) { + if (llvm::find(Pairs, I) != Pairs.end()) + continue; + for (auto &J : Unmatched) { + if (I == J || llvm::find(Pairs, J) != Pairs.end()) + continue; + + if (doInstructionsConverge(I, J)) { + Pairs.push_back(I); + Pairs.push_back(J); + break; + } + } + } + + // Try match found pairs + for (unsigned i = 0; i < Pairs.size(); i += 2) { + auto *I = Pairs[i]; + auto *J = Pairs[i + 1]; + + bool ContinueIdentification = false; + if (!identifyOrphanedCMulPartial(I, J, TL, ContinueIdentification)) { + if (ContinueIdentification) + continue; + return false; + } + + ContinueIdentification = false; + if (!identifyCAdd(I, J, TL, ContinueIdentification)) { + if (ContinueIdentification) + continue; + return false; + } + } + + auto UnmatchedInstructions = findUnmatchedInstructions(); + if (!UnmatchedInstructions.empty()) { + LLVM_DEBUG(dbgs() << "Unmatched instructions found in graph, cannot " + "confidently generate complex intrinsics.\n";); + return false; + } + + if (CompositeNodes.empty()) { + LLVM_DEBUG(dbgs() << "No composite nodes found.\n"); + return false; + } + + sortCompositeNodes(ConvergingI->getParent()); + + for (auto *It = CompositeNodes.begin() + 1; It != CompositeNodes.end(); + It++) { + auto CN = *It; + auto PrevCN = *(It - 1); + if (haveSharedUses(CN->OutputNode, PrevCN->OutputNode)) { + CN->Accumulator = PrevCN->OutputNode; + PrevCN->Accumulatee = CN->OutputNode; + } + } + + return true; +} + +bool ComplexDeinterleavingGraph::replaceNodes(const TargetLowering *TL) { + if (CompositeNodes.empty()) + return false; + + unsigned GeneratedIntrinsics = 0; + auto *ConvergingI = Instructions[0]; + + auto TTI = TL->getTargetMachine().getTargetTransformInfo( + *ConvergingI->getFunction()); + for (const auto &CN : CompositeNodes) { + auto *N = cast(CN->OutputNode); + + // Wrangle the inputs + + /// If the given value is part of a CompositeNode, and said node is part of + /// an accumulator chain, return the accumulator. 
Otherwise, returns the + /// "best fit" value (the ReplacementNode of a containing CompositeNode, or + /// the value itself) + auto FollowAccumulatorIfNecessary = [&](Value *V) -> Value * { + auto *I = dyn_cast(V); + if (!I) + return V; + + auto CN = getContainingComposite(I); + if (!CN) + return I; + + if (CN->Accumulatee) + CN = getContainingComposite(cast(CN->Accumulatee)); + + return CN->ReplacementNode; + }; + + /// Given a value and an operand index, get said operand and return it. + /// If the discovered operand is part of a composite node, return the + /// replacement instead. + auto GetInputFromOriginalInput = [&](Value *OriginalInput, + unsigned OpIdx) -> Value * { + auto *OriginalI = cast(OriginalInput); + if (OriginalI->getOpcode() == Instruction::FNeg) + OpIdx = 0; + + auto *Op = OriginalI->getOperand(OpIdx); + if (auto *SVI = dyn_cast(Op)) + Op = SVI->getOperand(0); + + if (!Op) + return nullptr; + + if (auto *I = dyn_cast(Op)) { + if (auto Containing = getContainingComposite(I)) { + if (Containing->ReplacementNode) + return Containing->ReplacementNode; + } + } + return Op; + }; + + if (CN->Operation == llvm::ComplexDeinterleavingOperation::CAdd) { + Value *Sub = nullptr; + if (auto *Op0 = dyn_cast(CN->OriginalInput0)) { + if (Op0->getOpcode() == Instruction::FSub) + Sub = Op0; + } + if (!Sub) { + if (auto *Op1 = dyn_cast(CN->OriginalInput1)) { + if (Op1->getOpcode() == Instruction::FSub) + Sub = Op1; + } + } + + if (!Sub) + return false; + + CN->Input0 = + FollowAccumulatorIfNecessary(GetInputFromOriginalInput(Sub, 0)); + CN->Input1 = + FollowAccumulatorIfNecessary(GetInputFromOriginalInput(Sub, 1)); + } else { + CN->Input0 = FollowAccumulatorIfNecessary( + GetInputFromOriginalInput(CN->OriginalInput0, 0)); + CN->Input1 = FollowAccumulatorIfNecessary( + GetInputFromOriginalInput(CN->OriginalInput1, 0)); + + if (CN->OriginalInput0 != CN->OriginalInput1 && CN->Input0 == CN->Input1) + CN->Input1 = FollowAccumulatorIfNecessary( + GetInputFromOriginalInput(CN->OriginalInput1, 1)); + } + + if (CN->Input0 == nullptr || CN->Input1 == nullptr) + continue; + + if (CN->Accumulator) { + if (auto Node = + getContainingComposite(cast(CN->Accumulator))) + CN->Accumulator = cast(Node->ReplacementNode); + } + + if (CN->Operation == llvm::ComplexDeinterleavingOperation::CMulPartial && + CN->Accumulator) { + if (auto Node = + getContainingComposite(cast(CN->Accumulator))) { + bool Valid90 = (Node->Rotation == 0 && CN->Rotation == 90) || + (Node->Rotation == 90 && CN->Rotation == 0); + bool Valid270 = (Node->Rotation == 180 && CN->Rotation == 270) || + (Node->Rotation == 270 && CN->Rotation == 180); + if (!Valid90 && !Valid270) { + LLVM_DEBUG(dbgs() << "Invalid rotation pairs.\n"); + return false; + } + + CN->Input0 = Node->Input0; + CN->Input1 = Node->Input1; + } + } + + CN->ReplacementNode = TL->createComplexDeinterleavingIR( + N, CN->Operation, CN->Rotation, CN->Input0, CN->Input1, + CN->Accumulator); + if (!CN->ReplacementNode) { + LLVM_DEBUG(dbgs() << "Target failed to create Intrinsic call.\n"); + return false; + } + + cast(CN->ReplacementNode) + ->moveAfter(cast(CN->OutputNode)); + + CostOfIntrinsics += TTI.getInstructionCost( + cast(CN->ReplacementNode), CostKind); + GeneratedIntrinsics += 1; + } + + auto *R = getFinalInputReplacement(ConvergingI); + if (!R) { + LLVM_DEBUG(dbgs() << "Unable to find Final Input Replacement.\n"); + return false; + } + + InstructionCost CostOfNodes; + for (const auto &I : Instructions) + CostOfNodes += TTI.getInstructionCost(I, CostKind); + + 
LLVM_DEBUG(dbgs() << "Evaluating cost of each graph. Instructions: " + << CostOfNodes << ", Intrinsics: " << CostOfIntrinsics + << ".\n"); + if (CostOfIntrinsics > CostOfNodes) { + LLVM_DEBUG(dbgs() << "Not replacing, cost was too high.\n"); + return false; + } + + ConvergingI->replaceAllUsesWith(R); + + NumComplexIntrinsics += GeneratedIntrinsics; + + return true; +} + +void ComplexDeinterleavingGraph::getDeadRoots( + SmallVector &DeadInstrRoots) { + for (const auto &CN : CompositeNodes) { + if (auto *I = dyn_cast_or_null(CN->ReplacementNode)) + DeadInstrRoots.push_back(I); + } +} diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -740,6 +740,15 @@ bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + bool isComplexDeinterleavingSupported() const override; + bool isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const override; + + Value *createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + unsigned Rotation, Value *InputA, Value *InputB, + Value *Accumulator = nullptr) const override; + protected: std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21727,3 +21727,77 @@ MF.getFrameInfo().computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } + +bool ARMTargetLowering::isComplexDeinterleavingSupported() const { + return Subtarget->hasMVEFloatOps(); +} + +bool ARMTargetLowering::isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const { + auto *VTy = dyn_cast(Ty); + if (!VTy) + return false; + + if (VTy->getNumElements() * VTy->getScalarSizeInBits() != 128) + return false; + + // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32 + auto *ScalarTy = VTy->getScalarType(); + if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy()) + return true; + + return false; +} + +Value *ARMTargetLowering::createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + unsigned Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { + + IRBuilder<> B(I); + auto *IntTy = Type::getInt32Ty(B.getContext()); + auto *Ty = InputA->getType(); + + if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { + + ConstantInt *ConstMulRot = nullptr; + + if (Rotation == 0) + ConstMulRot = ConstantInt::get(IntTy, 0); + else if (Rotation == 90) + ConstMulRot = ConstantInt::get(IntTy, 1); + else if (Rotation == 180) + ConstMulRot = ConstantInt::get(IntTy, 2); + else if (Rotation == 270) + ConstMulRot = ConstantInt::get(IntTy, 3); + + if (!ConstMulRot) + return nullptr; + + if (Accumulator) + return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty, + {ConstMulRot, Accumulator, InputB, InputA}); + return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty, + {ConstMulRot, InputB, InputA}); + } + + if (OperationType == ComplexDeinterleavingOperation::CAdd) { + + // 1 means the value is not halved. 
+ unsigned HalvingVal = 1; + auto *Halving = ConstantInt::get(IntTy, HalvingVal); + + unsigned RotKey; + if (Rotation == 90) + RotKey = 0; + else if (Rotation == 270) + RotKey = 1; + else + return nullptr; // Invalid rotation for arm_mve_vcaddq + + auto *RotVal = ConstantInt::get(IntTy, RotKey); + return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty, + {Halving, RotVal, InputA, InputB}); + } + + return nullptr; +} diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -426,12 +426,17 @@ TargetPassConfig::addIRPasses(); // Run the parallel DSP pass. - if (getOptLevel() == CodeGenOpt::Aggressive) + if (getOptLevel() == CodeGenOpt::Aggressive) addPass(createARMParallelDSPPass()); + // Match complex arithmetic patterns + if (TM->getOptLevel() >= CodeGenOpt::Default) + addPass(createComplexDeinterleavingPass(TM)); + // Match interleaved memory accesses to ldN/stN intrinsics. - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedAccessPass()); + } // Add Control Flow Guard checks. if (TM->getTargetTriple().isOSWindows()) diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll @@ -0,0 +1,301 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + + +define arm_aapcs_vfpcc <2 x half> @complex_add_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: complex_add_v2f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vadd.f16 s2, s2, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vsub.f16 s0, s4, s0 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x half> %b.real, %a.imag + %1 = fadd fast <1 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x half> %0, <1 x half> %1, <2 x i32> + ret <2 x half> %interleaved.vec +} +define arm_aapcs_vfpcc <4 x half> @complex_add_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: complex_add_v4f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vadd.f16 q3, q3, q0 +; CHECK-NEXT: vsub.f16 q0, q1, q2 +; CHECK-NEXT: vmovx.f16 s1, s0 +; CHECK-NEXT: vmovx.f16 s2, s12 +; CHECK-NEXT: vins.f16 s0, s12 +; CHECK-NEXT: vins.f16 s1, s2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x half> %b.real, 
%a.imag + %1 = fadd fast <2 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x half> %0, <2 x half> %1, <4 x i32> + ret <4 x half> %interleaved.vec +} +define arm_aapcs_vfpcc <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: complex_add_v8f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcadd.f16 q0, q1, q0, #90 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x half> %b.real, %a.imag + %1 = fadd fast <4 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x half> %0, <4 x half> %1, <8 x i32> + ret <8 x half> %interleaved.vec +} +define arm_aapcs_vfpcc <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: complex_add_v16f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmov.f32 s17, s2 +; CHECK-NEXT: vins.f16 s16, s1 +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vins.f16 s17, s3 +; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vmovx.f16 s18, s9 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vmovx.f16 s3, s3 +; CHECK-NEXT: vins.f16 s20, s18 +; CHECK-NEXT: vmovx.f16 s21, s10 +; CHECK-NEXT: vmovx.f16 s18, s11 +; CHECK-NEXT: vmovx.f16 s22, s12 +; CHECK-NEXT: vmovx.f16 s24, s13 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vmovx.f16 s3, s5 +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vmov.f32 s19, s6 +; CHECK-NEXT: vins.f16 s21, s18 +; CHECK-NEXT: vmov.f32 s18, s4 +; CHECK-NEXT: vins.f16 s22, s24 +; CHECK-NEXT: vmovx.f16 s23, s14 +; CHECK-NEXT: vmovx.f16 s24, s15 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vmovx.f16 s4, s7 +; CHECK-NEXT: vmovx.f16 s3, s6 +; CHECK-NEXT: vins.f16 s10, s11 +; CHECK-NEXT: vins.f16 s14, s15 +; CHECK-NEXT: vins.f16 s12, s13 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: vins.f16 s18, s5 +; CHECK-NEXT: vins.f16 s19, s7 +; CHECK-NEXT: vins.f16 s23, s24 +; CHECK-NEXT: vins.f16 s3, s4 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vadd.f16 q4, q5, q4 +; CHECK-NEXT: vsub.f16 q2, q2, q0 +; CHECK-NEXT: vmovx.f16 s0, s16 +; CHECK-NEXT: vmovx.f16 s4, s8 +; CHECK-NEXT: vmovx.f16 s5, s10 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vmovx.f16 s7, s11 +; CHECK-NEXT: vins.f16 s11, s19 +; CHECK-NEXT: vins.f16 s12, s0 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vins.f16 s8, s16 +; CHECK-NEXT: vins.f16 s10, s18 +; CHECK-NEXT: vins.f16 s9, s17 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s2, s9 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmov.f32 s3, s12 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x 
half> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x half> %b.real, %a.imag + %1 = fadd fast <8 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x half> %0, <8 x half> %1, <16 x i32> + ret <16 x half> %interleaved.vec +} +define arm_aapcs_vfpcc <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: complex_add_v32f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vmovx.f16 s18, s1 +; CHECK-NEXT: add r0, sp, #80 +; CHECK-NEXT: vins.f16 s16, s18 +; CHECK-NEXT: vmovx.f16 s17, s2 +; CHECK-NEXT: vmovx.f16 s18, s3 +; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: vmov.f32 s20, s24 +; CHECK-NEXT: vins.f16 s17, s18 +; CHECK-NEXT: vmovx.f16 s18, s4 +; CHECK-NEXT: vmovx.f16 s22, s5 +; CHECK-NEXT: vmovx.f16 s19, s6 +; CHECK-NEXT: vmovx.f16 s23, s7 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s1, s25 +; CHECK-NEXT: vmovx.f16 s24, s24 +; CHECK-NEXT: vmov.f32 s21, s26 +; CHECK-NEXT: vins.f16 s20, s25 +; CHECK-NEXT: vins.f16 s18, s22 +; CHECK-NEXT: vmov.f32 s22, s28 +; CHECK-NEXT: vins.f16 s19, s23 +; CHECK-NEXT: vmov.f32 s23, s30 +; CHECK-NEXT: vins.f16 s24, s1 +; CHECK-NEXT: vmovx.f16 s25, s26 +; CHECK-NEXT: vmovx.f16 s1, s27 +; CHECK-NEXT: vins.f16 s21, s27 +; CHECK-NEXT: vins.f16 s25, s1 +; CHECK-NEXT: vmovx.f16 s26, s28 +; CHECK-NEXT: vmovx.f16 s1, s29 +; CHECK-NEXT: vins.f16 s22, s29 +; CHECK-NEXT: vins.f16 s23, s31 +; CHECK-NEXT: add r0, sp, #112 +; CHECK-NEXT: vins.f16 s26, s1 +; CHECK-NEXT: vmovx.f16 s1, s31 +; CHECK-NEXT: vmovx.f16 s27, s30 +; CHECK-NEXT: vsub.f16 q4, q5, q4 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: vins.f16 s27, s1 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmovx.f16 s28, s8 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vmovx.f16 s6, s23 +; CHECK-NEXT: vadd.f16 q0, q6, q0 +; CHECK-NEXT: vmovx.f16 s27, s22 +; CHECK-NEXT: vmovx.f16 s4, s16 +; CHECK-NEXT: vins.f16 s16, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vins.f16 s27, s6 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s5, s18 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vins.f16 s18, s2 +; CHECK-NEXT: vmovx.f16 s26, s20 +; CHECK-NEXT: vmovx.f16 s2, s21 +; CHECK-NEXT: vins.f16 s28, s6 +; CHECK-NEXT: vmovx.f16 s29, s10 +; CHECK-NEXT: vmovx.f16 s6, s11 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmov.f32 s0, s12 +; CHECK-NEXT: vins.f16 s26, s2 +; CHECK-NEXT: vmov.f32 s2, s14 +; CHECK-NEXT: vins.f16 s29, s6 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmovx.f16 s30, s12 +; CHECK-NEXT: add r0, sp, #96 +; CHECK-NEXT: vins.f16 s30, s6 +; CHECK-NEXT: vins.f16 s0, s13 +; CHECK-NEXT: vins.f16 s2, s15 +; CHECK-NEXT: vmovx.f16 s6, s15 +; CHECK-NEXT: vmovx.f16 s31, s14 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vins.f16 s31, s6 +; CHECK-NEXT: vins.f16 s10, s11 +; CHECK-NEXT: vmovx.f16 s24, s12 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmovx.f16 s25, s14 +; CHECK-NEXT: vins.f16 s24, s6 +; CHECK-NEXT: vmovx.f16 s6, s15 +; CHECK-NEXT: vins.f16 s14, s15 +; CHECK-NEXT: vins.f16 s22, s23 +; CHECK-NEXT: vins.f16 s20, s21 +; CHECK-NEXT: vins.f16 s12, s13 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: vins.f16 s25, s6 +; CHECK-NEXT: vmov.f32 s13, s14 +; 
CHECK-NEXT: vmovx.f16 s7, s19 +; CHECK-NEXT: vmov.f32 s10, s0 +; CHECK-NEXT: vins.f16 s19, s3 +; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vadd.f16 q2, q6, q2 +; CHECK-NEXT: vmov.f32 s15, s22 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vsub.f16 q5, q3, q7 +; CHECK-NEXT: vmovx.f16 s24, s17 +; CHECK-NEXT: vmovx.f16 s6, s20 +; CHECK-NEXT: vmovx.f16 s13, s22 +; CHECK-NEXT: vins.f16 s6, s0 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vins.f16 s13, s0 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vins.f16 s24, s0 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vmovx.f16 s26, s21 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vins.f16 s22, s10 +; CHECK-NEXT: vmovx.f16 s15, s23 +; CHECK-NEXT: vins.f16 s23, s11 +; CHECK-NEXT: vins.f16 s26, s0 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vins.f16 s20, s8 +; CHECK-NEXT: vins.f16 s21, s9 +; CHECK-NEXT: vins.f16 s17, s1 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vins.f16 s15, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s4, s18 +; CHECK-NEXT: vmov.f32 s12, s22 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s6, s19 +; CHECK-NEXT: vmov.f32 s10, s21 +; CHECK-NEXT: vmov.f32 s14, s23 +; CHECK-NEXT: vmov.f32 s3, s24 +; CHECK-NEXT: vmov.f32 s11, s26 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %0 = fsub fast <16 x half> %b.real, %a.imag + %1 = fadd fast <16 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <16 x half> %0, <16 x half> %1, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll @@ -0,0 +1,351 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +define arm_aapcs_vfpcc <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: complex_mul_v2f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmul.f16 s6, s2, s0 +; CHECK-NEXT: vfma.f16 s6, s4, s8 +; CHECK-NEXT: vmul.f16 s8, s8, s2 +; CHECK-NEXT: vfnms.f16 s8, s4, s0 +; CHECK-NEXT: vins.f16 s8, s6 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %b.real = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> + %b.imag = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> + %0 = fmul fast <1 x half> %b.imag, %a.real + %1 = fmul fast <1 x half> %b.real, %a.imag + %2 = fadd fast <1 x half> %1, %0 + %3 = fmul fast <1 x half> %b.real, %a.real + %4 = fmul fast <1 x half> %a.imag, %b.imag + %5 = fsub fast <1 x half> %3, %4 + %interleaved.vec = shufflevector <1 x half> %5, <1 x half> %2, <2 x i32> + ret <2 x half> %interleaved.vec +} + +define 
arm_aapcs_vfpcc <4 x half> @complex_mul_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: complex_mul_v4f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vmul.f16 q4, q3, q0 +; CHECK-NEXT: vfma.f16 q4, q1, q2 +; CHECK-NEXT: vmul.f16 q2, q2, q3 +; CHECK-NEXT: vneg.f16 q2, q2 +; CHECK-NEXT: vfma.f16 q2, q1, q0 +; CHECK-NEXT: vmovx.f16 s0, s16 +; CHECK-NEXT: vmovx.f16 s9, s8 +; CHECK-NEXT: vins.f16 s8, s16 +; CHECK-NEXT: vins.f16 s9, s0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %0 = fmul fast <2 x half> %b.imag, %a.real + %1 = fmul fast <2 x half> %b.real, %a.imag + %2 = fadd fast <2 x half> %1, %0 + %3 = fmul fast <2 x half> %b.real, %a.real + %4 = fmul fast <2 x half> %a.imag, %b.imag + %5 = fsub fast <2 x half> %3, %4 + %interleaved.vec = shufflevector <2 x half> %5, <2 x half> %2, <4 x i32> + ret <4 x half> %interleaved.vec +} + +define arm_aapcs_vfpcc <8 x half> @complex_mul_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: complex_mul_v8f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmul.f16 q2, q0, q1, #90 +; CHECK-NEXT: vcmla.f16 q2, q0, q1, #0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %b.real = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %0 = fmul fast <4 x half> %b.imag, %a.real + %1 = fmul fast <4 x half> %b.real, %a.imag + %2 = fadd fast <4 x half> %1, %0 + %3 = fmul fast <4 x half> %b.real, %a.real + %4 = fmul fast <4 x half> %a.imag, %b.imag + %5 = fsub fast <4 x half> %3, %4 + %interleaved.vec = shufflevector <4 x half> %5, <4 x half> %2, <8 x i32> + ret <8 x half> %interleaved.vec +} + +define arm_aapcs_vfpcc <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: complex_mul_v16f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmovx.f16 s19, s6 +; CHECK-NEXT: vmovx.f16 s24, s7 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vmovx.f16 s18, s1 +; CHECK-NEXT: vins.f16 s19, s24 +; CHECK-NEXT: vmovx.f16 s24, s8 +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vmovx.f16 s25, s10 +; CHECK-NEXT: vins.f16 s16, s18 +; CHECK-NEXT: vmovx.f16 s17, s2 +; CHECK-NEXT: vmovx.f16 s18, s3 +; CHECK-NEXT: vins.f16 s25, s8 +; CHECK-NEXT: vmovx.f16 s8, s13 +; CHECK-NEXT: vmovx.f16 s26, s12 +; CHECK-NEXT: vins.f16 s17, s18 +; CHECK-NEXT: vmovx.f16 s18, s4 +; CHECK-NEXT: vmovx.f16 s22, s5 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s1, s9 +; CHECK-NEXT: vins.f16 s26, s8 +; CHECK-NEXT: vmovx.f16 s8, s15 +; CHECK-NEXT: vmovx.f16 s27, s14 +; CHECK-NEXT: vins.f16 s18, s22 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vins.f16 s24, s1 +; 
CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vins.f16 s27, s8 +; CHECK-NEXT: vmov.f32 s21, s10 +; CHECK-NEXT: vmov.f32 s22, s12 +; CHECK-NEXT: vins.f16 s20, s9 +; CHECK-NEXT: vmov.f32 s23, s14 +; CHECK-NEXT: vins.f16 s21, s11 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vins.f16 s22, s13 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vins.f16 s23, s15 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vmul.f16 q1, q4, q6 +; CHECK-NEXT: vmul.f16 q2, q6, q0 +; CHECK-NEXT: vneg.f16 q3, q1 +; CHECK-NEXT: vfma.f16 q3, q5, q0 +; CHECK-NEXT: vfma.f16 q2, q5, q4 +; CHECK-NEXT: vmovx.f16 s4, s12 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s5, s14 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vins.f16 s12, s8 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s8, s13 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmovx.f16 s7, s15 +; CHECK-NEXT: vins.f16 s15, s11 +; CHECK-NEXT: vins.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vins.f16 s14, s10 +; CHECK-NEXT: vins.f16 s13, s9 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vmov.f32 s2, s13 +; CHECK-NEXT: vmov.f32 s3, s8 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %b.real = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %0 = fmul fast <8 x half> %b.imag, %a.real + %1 = fmul fast <8 x half> %b.real, %a.imag + %2 = fadd fast <8 x half> %1, %0 + %3 = fmul fast <8 x half> %b.real, %a.real + %4 = fmul fast <8 x half> %a.imag, %b.imag + %5 = fsub fast <8 x half> %3, %4 + %interleaved.vec = shufflevector <8 x half> %5, <8 x half> %2, <16 x i32> + ret <16 x half> %interleaved.vec +} + +define arm_aapcs_vfpcc <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: complex_mul_v32f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: add r0, sp, #96 +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: add r0, sp, #112 +; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmovx.f16 s16, s24 +; CHECK-NEXT: vmovx.f16 s18, s25 +; CHECK-NEXT: vins.f16 s16, s18 +; CHECK-NEXT: vmovx.f16 s17, s26 +; CHECK-NEXT: vmovx.f16 s18, s27 +; CHECK-NEXT: vmovx.f16 s19, s29 +; CHECK-NEXT: vins.f16 s17, s18 +; CHECK-NEXT: vmovx.f16 s18, s28 +; CHECK-NEXT: vins.f16 s18, s19 +; CHECK-NEXT: vmovx.f16 s19, s30 +; CHECK-NEXT: vmovx.f16 s8, s31 +; CHECK-NEXT: vmov.f32 s20, s0 +; CHECK-NEXT: vins.f16 s19, s8 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmov.f32 s21, s2 +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vins.f16 s20, s1 +; CHECK-NEXT: vmov.f32 s23, s6 +; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vins.f16 s21, s3 +; CHECK-NEXT: vins.f16 s1, s8 +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s8, s5 +; CHECK-NEXT: vins.f16 s22, s5 +; CHECK-NEXT: vins.f16 s23, s7 +; CHECK-NEXT: vmovx.f16 s4, s7 +; CHECK-NEXT: vmovx.f16 s3, s6 +; CHECK-NEXT: vins.f16 s26, s27 +; CHECK-NEXT: 
vins.f16 s30, s31 +; CHECK-NEXT: vins.f16 s28, s29 +; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vins.f16 s24, s25 +; CHECK-NEXT: vmov.f32 s25, s26 +; CHECK-NEXT: vins.f16 s3, s4 +; CHECK-NEXT: vmul.f16 q2, q4, q5 +; CHECK-NEXT: vmov.f32 s26, s28 +; CHECK-NEXT: add r0, sp, #128 +; CHECK-NEXT: vmov.f32 s27, s30 +; CHECK-NEXT: vfma.f16 q2, q6, q0 +; CHECK-NEXT: vmul.f16 q0, q0, q4 +; CHECK-NEXT: vneg.f16 q4, q0 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vfma.f16 q4, q6, q5 +; CHECK-NEXT: vmovx.f16 s20, s12 +; CHECK-NEXT: vmovx.f16 s4, s16 +; CHECK-NEXT: vmovx.f16 s5, s18 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s0, s13 +; CHECK-NEXT: vins.f16 s16, s8 +; CHECK-NEXT: vins.f16 s18, s10 +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s20, s0 +; CHECK-NEXT: vmovx.f16 s21, s14 +; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vmovx.f16 s22, s8 +; CHECK-NEXT: vins.f16 s21, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: add r0, sp, #144 +; CHECK-NEXT: vins.f16 s22, s6 +; CHECK-NEXT: vmovx.f16 s23, s10 +; CHECK-NEXT: vmovx.f16 s6, s11 +; CHECK-NEXT: vmov.f32 s24, s0 +; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: vins.f16 s23, s6 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vins.f16 s24, s1 +; CHECK-NEXT: vins.f16 s0, s6 +; CHECK-NEXT: vmovx.f16 s6, s3 +; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vmov.f32 s25, s2 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vmovx.f16 s6, s29 +; CHECK-NEXT: vmovx.f16 s2, s28 +; CHECK-NEXT: vins.f16 s25, s3 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s10, s11 +; CHECK-NEXT: vins.f16 s14, s15 +; CHECK-NEXT: vmovx.f16 s6, s31 +; CHECK-NEXT: vmovx.f16 s3, s30 +; CHECK-NEXT: vmov.f32 s26, s28 +; CHECK-NEXT: vmov.f32 s27, s30 +; CHECK-NEXT: vins.f16 s12, s13 +; CHECK-NEXT: vmov.f32 s13, s14 +; CHECK-NEXT: vins.f16 s3, s6 +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vins.f16 s26, s29 +; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vins.f16 s27, s31 +; CHECK-NEXT: vmul.f16 q7, q0, q3 +; CHECK-NEXT: vmul.f16 q0, q5, q0 +; CHECK-NEXT: vfma.f16 q7, q6, q5 +; CHECK-NEXT: vneg.f16 q5, q0 +; CHECK-NEXT: vfma.f16 q5, q6, q3 +; CHECK-NEXT: vmovx.f16 s0, s28 +; CHECK-NEXT: vmovx.f16 s6, s20 +; CHECK-NEXT: vmovx.f16 s13, s22 +; CHECK-NEXT: vins.f16 s6, s0 +; CHECK-NEXT: vmovx.f16 s0, s30 +; CHECK-NEXT: vins.f16 s13, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s24, s17 +; CHECK-NEXT: vmovx.f16 s7, s19 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s26, s21 +; CHECK-NEXT: vins.f16 s24, s0 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmovx.f16 s0, s29 +; CHECK-NEXT: vins.f16 s22, s30 +; CHECK-NEXT: vmovx.f16 s15, s23 +; CHECK-NEXT: vins.f16 s23, s31 +; CHECK-NEXT: vins.f16 s26, s0 +; CHECK-NEXT: vmovx.f16 s0, s31 +; CHECK-NEXT: vins.f16 s20, s28 +; CHECK-NEXT: vins.f16 s21, s29 +; CHECK-NEXT: vins.f16 s17, s1 +; CHECK-NEXT: vins.f16 s19, s3 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vins.f16 s15, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s4, s18 +; CHECK-NEXT: vmov.f32 s12, s22 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s6, s19 +; CHECK-NEXT: vmov.f32 s10, s21 +; CHECK-NEXT: vmov.f32 s14, s23 +; CHECK-NEXT: vmov.f32 s3, s24 +; CHECK-NEXT: vmov.f32 s11, 
s26 +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %0 = fmul fast <16 x half> %b.imag, %a.real + %1 = fmul fast <16 x half> %b.real, %a.imag + %2 = fadd fast <16 x half> %1, %0 + %3 = fmul fast <16 x half> %b.real, %a.real + %4 = fmul fast <16 x half> %a.imag, %b.imag + %5 = fsub fast <16 x half> %3, %4 + %interleaved.vec = shufflevector <16 x half> %5, <16 x half> %2, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll @@ -0,0 +1,157 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + + +define arm_aapcs_vfpcc <2 x float> @complex_add_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: complex_add_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f32 s5, s5, s0 +; CHECK-NEXT: vsub.f32 s4, s4, s1 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x float> %b.real, %a.imag + %1 = fadd fast <1 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x float> %0, <1 x float> %1, <2 x i32> + ret <2 x float> %interleaved.vec +} +define arm_aapcs_vfpcc <4 x float> @complex_add_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: complex_add_v4f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcadd.f32 q2, q1, q0, #90 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x float> %b.real, %a.imag + %1 = fadd fast <2 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} +define arm_aapcs_vfpcc <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: complex_add_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.f32 s4, s0 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s20, s9 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: vmov.f32 s6, s16 +; CHECK-NEXT: vmov.f32 s7, s18 +; CHECK-NEXT: vmov.f32 s22, s13 +; CHECK-NEXT: vmov.f32 s23, s15 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vadd.f32 q1, q5, q1 +; CHECK-NEXT: vmov.f32 s3, 
s19 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vsub.f32 q2, q2, q0 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s0, s8 +; CHECK-NEXT: vmov.f32 s2, s9 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x float> %b.real, %a.imag + %1 = fadd fast <4 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> + ret <8 x float> %interleaved.vec +} +define arm_aapcs_vfpcc <16 x float> @complex_add_v16f32(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: complex_add_v16f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: add r3, sp, #80 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: vldrw.u32 q6, [r2] +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.f32 s4, s0 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: add r0, sp, #96 +; CHECK-NEXT: vmov.f32 s28, s25 +; CHECK-NEXT: add r1, sp, #112 +; CHECK-NEXT: vmov.f32 s29, s27 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s25, s26 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vmov.f32 s26, s20 +; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s6, s16 +; CHECK-NEXT: vmov.f32 s7, s18 +; CHECK-NEXT: vsub.f32 q4, q6, q0 +; CHECK-NEXT: vmov.f32 s30, s21 +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vmov.f32 s31, s23 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vadd.f32 q1, q7, q1 +; CHECK-NEXT: vmov.f32 s0, s16 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.f32 s4, s18 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s6, s19 +; CHECK-NEXT: vmov.f32 s16, s8 +; CHECK-NEXT: vmov.f32 s17, s10 +; CHECK-NEXT: vmov.f32 s28, s25 +; CHECK-NEXT: vmov.f32 s29, s27 +; CHECK-NEXT: vmov.f32 s8, s9 +; CHECK-NEXT: vmov.f32 s9, s11 +; CHECK-NEXT: vmov.f32 s25, s26 +; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s30, s21 +; CHECK-NEXT: vmov.f32 s31, s23 +; CHECK-NEXT: vmov.f32 s10, s13 +; CHECK-NEXT: vadd.f32 q4, q7, q4 +; CHECK-NEXT: vmov.f32 s11, s15 +; CHECK-NEXT: vmov.f32 s26, s20 +; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vsub.f32 q3, q6, q2 +; CHECK-NEXT: vmov.f32 s9, s16 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vmov.f32 s17, s18 +; CHECK-NEXT: vmov.f32 s16, s14 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov.f32 s8, s12 +; CHECK-NEXT: vmov.f32 s10, s13 +; CHECK-NEXT: vmov q3, q4 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x float> %b.real, %a.imag + %1 = 
fadd fast <8 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x float> %0, <8 x float> %1, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll @@ -0,0 +1,191 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +define arm_aapcs_vfpcc <2 x float> @complex_mul_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: complex_mul_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.f32 s9, s5, s0 +; CHECK-NEXT: vmul.f32 s8, s1, s5 +; CHECK-NEXT: vfma.f32 s9, s4, s1 +; CHECK-NEXT: vfnms.f32 s8, s4, s0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %0 = fmul fast <1 x float> %b.imag, %a.real + %1 = fmul fast <1 x float> %b.real, %a.imag + %2 = fadd fast <1 x float> %1, %0 + %3 = fmul fast <1 x float> %b.real, %a.real + %4 = fmul fast <1 x float> %a.imag, %b.imag + %5 = fsub fast <1 x float> %3, %4 + %interleaved.vec = shufflevector <1 x float> %5, <1 x float> %2, <2 x i32> + ret <2 x float> %interleaved.vec +} + +define arm_aapcs_vfpcc <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: complex_mul_v4f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmul.f32 q2, q0, q1, #90 +; CHECK-NEXT: vcmla.f32 q2, q0, q1, #0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %b.imag, %a.real + %1 = fmul fast <2 x float> %b.real, %a.imag + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %b.real, %a.real + %4 = fmul fast <2 x float> %a.imag, %b.imag + %5 = fsub fast <2 x float> %3, %4 + %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> + ret <4 x float> %interleaved.vec +} + +define arm_aapcs_vfpcc <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: complex_mul_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.f32 s20, s0 +; CHECK-NEXT: vmov.f32 s21, s2 +; CHECK-NEXT: vmov.f32 s24, s9 +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f32 s22, s16 +; CHECK-NEXT: vmov.f32 s23, s18 +; CHECK-NEXT: vmov.f32 s26, s13 +; CHECK-NEXT: vmov.f32 s27, s15 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmul.f32 q1, q6, q5 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vfma.f32 q1, q2, q0 +; CHECK-NEXT: vmul.f32 q0, q0, q6 +; CHECK-NEXT: vneg.f32 q3, q0 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vfma.f32 q3, q2, q5 
+; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s0, s12 +; CHECK-NEXT: vmov.f32 s2, s13 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %0 = fmul fast <4 x float> %b.imag, %a.real + %1 = fmul fast <4 x float> %b.real, %a.imag + %2 = fadd fast <4 x float> %1, %0 + %3 = fmul fast <4 x float> %b.real, %a.real + %4 = fmul fast <4 x float> %a.imag, %b.imag + %5 = fsub fast <4 x float> %3, %4 + %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> + ret <8 x float> %interleaved.vec +} + +define arm_aapcs_vfpcc <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: complex_mul_v16f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: add r2, sp, #96 +; CHECK-NEXT: add r3, sp, #112 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: vldrw.u32 q4, [r2] +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s24, s1 +; CHECK-NEXT: vmov.f32 s12, s17 +; CHECK-NEXT: add r0, sp, #144 +; CHECK-NEXT: vmov.f32 s13, s19 +; CHECK-NEXT: add r1, sp, #128 +; CHECK-NEXT: vmov.f32 s25, s3 +; CHECK-NEXT: vmov.f32 s14, s21 +; CHECK-NEXT: vmov.f32 s15, s23 +; CHECK-NEXT: vmov.f32 s26, s5 +; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmul.f32 q7, q6, q3 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vmul.f32 q1, q3, q0 +; CHECK-NEXT: vmov.f32 s17, s18 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.f32 s19, s22 +; CHECK-NEXT: vneg.f32 q5, q7 +; CHECK-NEXT: vfma.f32 q5, q4, q0 +; CHECK-NEXT: vfma.f32 q1, q4, q6 +; CHECK-NEXT: vmov.f32 s0, s20 +; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s2, s21 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s10 +; CHECK-NEXT: vmov.f32 s28, s25 +; CHECK-NEXT: vmov.f32 s29, s27 +; CHECK-NEXT: vmov.f32 s8, s9 +; CHECK-NEXT: vmov.f32 s9, s11 +; CHECK-NEXT: vmov.f32 s30, s13 +; CHECK-NEXT: vmov.f32 s31, s15 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s4, s22 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s6, s23 +; CHECK-NEXT: vmov.f32 s22, s0 +; CHECK-NEXT: vmov.f32 s23, s2 +; CHECK-NEXT: vmul.f32 q0, q2, q7 +; CHECK-NEXT: vmov.f32 s25, s26 +; CHECK-NEXT: vmul.f32 q4, q7, q5 +; CHECK-NEXT: vmov.f32 s26, s12 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: vmov.f32 s27, s14 +; CHECK-NEXT: vfma.f32 q4, q6, q2 +; CHECK-NEXT: vfma.f32 q0, q6, q5 +; CHECK-NEXT: vmov.f32 s9, s16 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vmov.f32 s17, s18 +; CHECK-NEXT: vmov.f32 s16, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov q3, q4 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vpop {d8, d9, d10, d11, 
d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> + %a.imag = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %0 = fmul fast <8 x float> %b.imag, %a.real + %1 = fmul fast <8 x float> %b.real, %a.imag + %2 = fadd fast <8 x float> %1, %0 + %3 = fmul fast <8 x float> %b.real, %a.real + %4 = fmul fast <8 x float> %a.imag, %b.imag + %5 = fsub fast <8 x float> %3, %4 + %interleaved.vec = shufflevector <8 x float> %5, <8 x float> %2, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + + +define arm_aapcs_vfpcc <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_add_v2f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f64 d3, d3, d0 +; CHECK-NEXT: vsub.f64 d2, d2, d1 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x double> %b.real, %a.imag + %1 = fadd fast <1 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x double> %0, <1 x double> %1, <2 x i32> + ret <2 x double> %interleaved.vec +} +define arm_aapcs_vfpcc <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: complex_add_v4f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f64 d5, d5, d0 +; CHECK-NEXT: vsub.f64 d4, d4, d1 +; CHECK-NEXT: vadd.f64 d7, d7, d2 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vsub.f64 d6, d6, d3 +; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x double> %b.real, %a.imag + %1 = fadd fast <2 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x double> %0, <2 x double> %1, <4 x i32> + ret <4 x double> %interleaved.vec +} +define arm_aapcs_vfpcc <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_add_v8f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add r0, sp, #48 +; CHECK-NEXT: vadd.f64 d1, d1, d2 +; CHECK-NEXT: vsub.f64 d0, d0, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: vadd.f64 d3, d3, d8 +; CHECK-NEXT: vsub.f64 d2, d2, d9 +; CHECK-NEXT: 
vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #80 +; CHECK-NEXT: vadd.f64 d9, d9, d4 +; CHECK-NEXT: vsub.f64 d8, d8, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vadd.f64 d11, d5, d6 +; CHECK-NEXT: vsub.f64 d10, d4, d7 +; CHECK-NEXT: vmov q2, q4 +; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x double> %b.real, %a.imag + %1 = fadd fast <4 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x double> %0, <4 x double> %1, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +define arm_aapcs_vfpcc <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_mul_v2f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.f64 d5, d3, d0 +; CHECK-NEXT: vmul.f64 d4, d1, d3 +; CHECK-NEXT: vfma.f64 d5, d2, d1 +; CHECK-NEXT: vfnms.f64 d4, d2, d0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %0 = fmul fast <1 x double> %b.imag, %a.real + %1 = fmul fast <1 x double> %b.real, %a.imag + %2 = fadd fast <1 x double> %1, %0 + %3 = fmul fast <1 x double> %b.real, %a.real + %4 = fmul fast <1 x double> %a.imag, %b.imag + %5 = fsub fast <1 x double> %3, %4 + %interleaved.vec = shufflevector <1 x double> %5, <1 x double> %2, <2 x i32> + ret <2 x double> %interleaved.vec +} + +define arm_aapcs_vfpcc <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: complex_mul_v4f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmul.f64 d9, d7, d2 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vmul.f64 d8, d3, d7 +; CHECK-NEXT: vfma.f64 d9, d6, d3 +; CHECK-NEXT: vfnms.f64 d8, d6, d2 +; CHECK-NEXT: vmul.f64 d1, d5, d10 +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmul.f64 d0, d11, d5 +; CHECK-NEXT: vfma.f64 d1, d4, d11 +; CHECK-NEXT: vfnms.f64 d0, d4, d10 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %b.imag, %a.real + %1 = fmul fast <2 x double> %b.real, %a.imag + %2 = fadd fast <2 x double> %1, %0 + %3 = fmul fast <2 x double> %b.real, %a.real + %4 = fmul fast <2 x double> %a.imag, 
%b.imag + %5 = fsub fast <2 x double> %3, %4 + %interleaved.vec = shufflevector <2 x double> %5, <2 x double> %2, <4 x i32> + ret <4 x double> %interleaved.vec +} + +define arm_aapcs_vfpcc <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_mul_v8f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: add r0, sp, #128 +; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #160 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov q6, q0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: add r0, sp, #176 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d5, d3, d0 +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d4, d1, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: add r0, sp, #144 +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d11, d3, d0 +; CHECK-NEXT: vmul.f64 d10, d1, d3 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmul.f64 d7, d9, d12 +; CHECK-NEXT: vmul.f64 d2, d15, d1 +; CHECK-NEXT: vmul.f64 d3, d1, d14 +; CHECK-NEXT: vmul.f64 d6, d13, d9 +; CHECK-NEXT: vfma.f64 d7, d8, d13 +; CHECK-NEXT: vfnms.f64 d6, d8, d12 +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d3, d0, d15 +; CHECK-NEXT: vfnms.f64 d2, d0, d14 +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d5, d0, d9 +; CHECK-NEXT: vfnms.f64 d4, d0, d8 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d11, d0, d9 +; CHECK-NEXT: vfnms.f64 d10, d0, d8 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %0 = fmul fast <4 x double> %b.imag, %a.real + %1 = fmul fast <4 x double> %b.real, %a.imag + %2 = fadd fast <4 x double> %1, %0 + %3 = fmul fast <4 x double> %b.real, %a.real + %4 = fmul fast <4 x double> %a.imag, %b.imag + %5 = fsub fast <4 x double> %3, %4 + %interleaved.vec = shufflevector <4 x double> %5, <4 x double> %2, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Transform functions to use DSP intrinsics +; CHECK-NEXT: Complex Arithmetic Pass ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: Type Promotion ; CHECK-NEXT: CodeGen Prepare