diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -43,6 +43,7 @@ class DominatorTree; class BranchInst; class CallBase; +class ComplexArithmeticGraph; class Function; class GlobalValue; class InstCombiner; @@ -762,6 +763,13 @@ /// the scalarization cost of a load/store. bool supportsEfficientVectorElementLoadStore() const; + bool supportsComplexArithmetic() const; + + Value *createComplexArithmeticIR(ComplexArithmeticGraph &G, Value *InputA, + Value *InputB, int &GeneratedIntrinsicCount); + + bool matchComplexArithmeticIR(Instruction *I, ComplexArithmeticGraph &G); + /// Don't restrict interleaved unrolling to small loops. bool enableAggressiveInterleaving(bool LoopHasReductions) const; @@ -1582,6 +1590,12 @@ getOperandsScalarizationOverhead(ArrayRef Args, ArrayRef Tys) = 0; virtual bool supportsEfficientVectorElementLoadStore() = 0; + virtual bool supportsComplexArithmetic() const = 0; + virtual Value *createComplexArithmeticIR(ComplexArithmeticGraph &G, + Value *InputA, Value *InputB, + int &GeneratedIntrinsicCount) = 0; + virtual bool matchComplexArithmeticIR(Instruction *I, + ComplexArithmeticGraph &G) = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; virtual MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0; @@ -2030,6 +2044,22 @@ return Impl.supportsEfficientVectorElementLoadStore(); } + bool supportsComplexArithmetic() const override { + return Impl.supportsComplexArithmetic(); + } + + Value *createComplexArithmeticIR(ComplexArithmeticGraph &G, Value *InputA, + Value *InputB, + int &GeneratedIntrinsicCount) override { + return Impl.createComplexArithmeticIR(G, InputA, InputB, + GeneratedIntrinsicCount); + } + + bool matchComplexArithmeticIR(Instruction *I, + ComplexArithmeticGraph &G) override { + return Impl.matchComplexArithmeticIR(I, G); + } + bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -334,6 +334,16 @@ bool supportsEfficientVectorElementLoadStore() const { return false; } + bool supportsComplexArithmetic() const { return false; } + Value *createComplexArithmeticIR(ComplexArithmeticGraph &, Value *, Value *, + int &GeneratedIntrinsicCount) { + return nullptr; + } + + bool matchComplexArithmeticIR(Instruction *I, ComplexArithmeticGraph &G) { + return false; + } + bool enableAggressiveInterleaving(bool LoopHasReductions) const { return false; } diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -114,9 +114,10 @@ void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); void initializeCheckDebugMachineModulePass(PassRegistry &); -void initializeCodeGenPreparePass(PassRegistry&); -void initializeConstantHoistingLegacyPassPass(PassRegistry&); -void initializeConstantMergeLegacyPassPass(PassRegistry&); +void initializeCodeGenPreparePass(PassRegistry &); +void initializeComplexArithmeticLegacyPassPass(PassRegistry &); +void 
initializeConstantHoistingLegacyPassPass(PassRegistry &); +void initializeConstantMergeLegacyPassPass(PassRegistry &); void initializeConstraintEliminationPass(PassRegistry &); void initializeControlHeightReductionLegacyPassPass(PassRegistry&); void initializeCorrelatedValuePropagationPass(PassRegistry&); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -555,13 +555,20 @@ // FunctionPass *createInstSimplifyLegacyPass(); - //===----------------------------------------------------------------------===// // // createScalarizeMaskedMemIntrinPass - Replace masked load, store, gather // and scatter intrinsics with scalar code when target doesn't support them. // FunctionPass *createScalarizeMaskedMemIntrinLegacyPass(); -} // End llvm namespace + +//===----------------------------------------------------------------------===// +// +// This pass implements generation of target-specific intrinsics to support +// handling of complex number arithmetic +// +FunctionPass *createComplexArithmeticPass(); + +} // namespace llvm #endif diff --git a/llvm/include/llvm/Transforms/Scalar/ComplexArithmetic.h b/llvm/include/llvm/Transforms/Scalar/ComplexArithmetic.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/ComplexArithmetic.h @@ -0,0 +1,252 @@ +//===- ComplexArithmetic.h - Complex Arithmetic Pass --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements generation of target-specific intrinsics to support +// handling of complex number arithmetic. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_COMPLEXARITHMETIC_H +#define LLVM_TRANSFORMS_SCALAR_COMPLEXARITHMETIC_H + +#include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" + +namespace llvm { + +class Function; + +struct ComplexArithmeticPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/** + * Annotated graph-like structure that enriches the existing Instruction graph, + * allowing for contextual clues relevant to complex arithmetic to be provided + * and given to TTI hooks as required. + */ +class ComplexArithmeticGraph { +public: + /** + * Bitflags denoting the type of the instruction node. + */ + enum NodeType { + Unknown = 0, + // Actual node types + + Real = 1, + Imaginary = 2, + Load = 4, + Store = 8, + Shuffle = 16, + AddOperand = 32, + + // Meta node types, defining additional behaviour upon node creation + + /** + * Will cause the node to look at parents to try and identify the type. + * Parents must already be registered and identified. + */ + Discover = 0xffff, + }; + + enum GraphType { Complex_Mul, Complex_Add }; + + struct Node { + public: + Instruction *I; + NodeType NType; + + Node(Instruction *i, enum NodeType nodeType) : I(i), NType(nodeType) {} + }; + + /** + * Returns a copy of the vector of all registered nodes. + */ + SmallVector getAllNodes() { + SmallVector Is; + for (auto &N : Nodes) + Is.push_back(N->I); + return Is; + } + + /** + * Returns a vector of all registered nodes that are of the given type. 
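+   *
+   * For illustration, a query matching this pass's own usage in
+   * substituteGraph (flags combine bitwise, and a node must carry every
+   * requested bit to match):
+   *
+   *   auto RealShuffles = G.getNodesOfType(
+   *       ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real);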
+   */
+  SmallVector<Instruction *, 8> getNodesOfType(NodeType Type) {
+    SmallVector<Instruction *, 8> Is;
+    for (auto &N : Nodes) {
+      if ((N->NType & Type) == Type)
+        Is.push_back(N->I);
+    }
+    return Is;
+  }
+
+  /**
+   * Returns the node type of I. It must already be registered and identified,
+   * otherwise `Unknown` is returned.
+   */
+  NodeType getNodeType(Instruction *I) {
+    auto *N = getNode(I);
+    if (N == nullptr)
+      return Unknown;
+    return N->NType;
+  }
+
+  /**
+   * Registers and identifies the given Instruction, optionally with the
+   * provided NodeType.
+   */
+  void addNode(Instruction *I, NodeType NType = Unknown) {
+    if ((NType & Discover) == Discover) {
+      auto LeftType = getNodeType(cast<Instruction>(I->getOperand(0)));
+      auto RightType = getNodeType(cast<Instruction>(I->getOperand(1)));
+
+      if (LeftType == Unknown || RightType == Unknown) {
+        NType = Unknown;
+      } else if (I->getOpcode() == Instruction::FMul) {
+        // Real*real and imaginary*imaginary products feed the real part;
+        // mixed products feed the imaginary part.
+        NType = LeftType == RightType ? Real : Imaginary;
+      } else {
+        NType = LeftType;
+      }
+    }
+
+    auto *Existing = getNode(I);
+    if (Existing != nullptr) {
+      if (Existing->NType == NType)
+        return;
+      llvm_unreachable(
+          "A node has been added twice, with conflicting node types.");
+    }
+
+    Nodes.push_back(std::make_unique<Node>(I, NType));
+  }
+
+  LLVMContext &getContext() { return CurrentI->getContext(); }
+
+  Instruction *getCurrentInstruction() { return CurrentI; }
+
+  void setCurrentInstruction(Instruction *I) { CurrentI = I; }
+
+  void setType(GraphType Type) { GType = Type; }
+
+  GraphType getType() { return GType; }
+
+  void setRotation(unsigned R) { Rotation = R; }
+
+  unsigned getRotation() { return Rotation; }
+
+  /**
+   * Sets the graph userdata pointer. The graph then assumes ownership of the
+   * pointer, and will free it on destruction.
+   */
+  template <typename T> void setUserData(T *Ptr) {
+    UserData = std::shared_ptr<T>(Ptr);
+  }
+
+  /**
+   * Gets the graph userdata pointer, casting it to T.
+   *
+   * Note: No checks are made by the graph to ensure the type of the data is
+   * as requested. It is up to the caller to check for that.
+   */
+  template <typename T> T *getUserData() {
+    if (UserData == nullptr)
+      return nullptr;
+    return static_cast<T *>(UserData.get());
+  }
+
+private:
+  unsigned Rotation = 0;
+  GraphType GType;
+  Instruction *CurrentI = nullptr;
+  // std::unique_ptr doesn't support void* without an explicit deleter
+  std::shared_ptr<void> UserData;
+
+  Node *getNode(Instruction *I) {
+    for (const auto &Item : Nodes) {
+      if (Item->I == I)
+        return Item.get();
+    }
+    return nullptr;
+  }
+
+  SmallVector<std::unique_ptr<Node>, 8> Nodes;
+};
+
+inline ComplexArithmeticGraph::NodeType
+operator~(ComplexArithmeticGraph::NodeType a) {
+  return (ComplexArithmeticGraph::NodeType) ~(int)a;
+}
+inline ComplexArithmeticGraph::NodeType
+operator|(ComplexArithmeticGraph::NodeType a,
+          ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType)((int)a | (int)b);
+}
+inline ComplexArithmeticGraph::NodeType
+operator&(ComplexArithmeticGraph::NodeType a,
+          ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType)((int)a & (int)b);
+}
+inline ComplexArithmeticGraph::NodeType
+operator^(ComplexArithmeticGraph::NodeType a,
+          ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType)((int)a ^ (int)b);
+}
+inline ComplexArithmeticGraph::NodeType &
+operator|=(ComplexArithmeticGraph::NodeType &a,
+           ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType &)((int &)a |= (int)b);
+}
+inline ComplexArithmeticGraph::NodeType &
+operator&=(ComplexArithmeticGraph::NodeType &a,
+           ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType &)((int &)a &= (int)b);
+}
+inline ComplexArithmeticGraph::NodeType &
+operator^=(ComplexArithmeticGraph::NodeType &a,
+           ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType &)((int &)a ^= (int)b);
+}
+
+/**
+ * Creates a contiguous mask of the given length, optionally with a base
+ * offset (so length 4 with offset 8 gives <8, 9, 10, 11>). Returning owned
+ * storage avoids leaking a heap array behind a raw ArrayRef.
+ */
+inline SmallVector<int, 16> createContiguousMask(int Len, int Offset = 0) {
+  SmallVector<int, 16> Mask;
+  for (int i = 0; i < Len; ++i)
+    Mask.push_back(i + Offset);
+  return Mask;
+}
+
+/**
+ * Creates an interleaving mask of the given length.
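+ *
+ * For example, a length of 8 yields <0, 4, 1, 5, 2, 6, 3, 7>: lane i of the
+ * first half is paired with lane i of the second half.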
+ */
+inline SmallVector<int, 16> createInterleavingMask(int Len) {
+  int Step = Len / 2;
+  SmallVector<int, 16> Mask;
+  int Idx = 0;
+  for (int i = 0; i < Len; i += 2) {
+    Mask.push_back(Idx);
+    Mask.push_back(Idx + Step);
+    ++Idx;
+  }
+  return Mask;
+}
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_COMPLEXARITHMETIC_H
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -507,6 +507,23 @@
   return TTIImpl->supportsEfficientVectorElementLoadStore();
 }
 
+bool TargetTransformInfo::supportsComplexArithmetic() const {
+  return TTIImpl->supportsComplexArithmetic();
+}
+
+Value *
+TargetTransformInfo::createComplexArithmeticIR(ComplexArithmeticGraph &G,
+                                               Value *InputA, Value *InputB,
+                                               int &GeneratedIntrinsicCount) {
+  return TTIImpl->createComplexArithmeticIR(G, InputA, InputB,
+                                            GeneratedIntrinsicCount);
+}
+
+bool TargetTransformInfo::matchComplexArithmeticIR(Instruction *I,
+                                                   ComplexArithmeticGraph &G) {
+  return TTIImpl->matchComplexArithmeticIR(I, G);
+}
+
 bool TargetTransformInfo::enableAggressiveInterleaving(
     bool LoopHasReductions) const {
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -145,6 +145,7 @@
 #include "llvm/Transforms/Scalar/AnnotationRemarks.h"
 #include "llvm/Transforms/Scalar/BDCE.h"
 #include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/Transforms/Scalar/ComplexArithmetic.h"
 #include "llvm/Transforms/Scalar/ConstantHoisting.h"
 #include "llvm/Transforms/Scalar/ConstraintElimination.h"
 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -241,6 +241,7 @@
 FUNCTION_PASS("bounds-checking", BoundsCheckingPass())
 FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass())
 FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass())
+FUNCTION_PASS("complex-arithmetic", ComplexArithmeticPass())
 FUNCTION_PASS("consthoist", ConstantHoistingPass())
 FUNCTION_PASS("constraint-elimination", ConstraintEliminationPass())
 FUNCTION_PASS("chr", ControlHeightReductionPass())
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -423,11 +423,15 @@
   // Run the parallel DSP pass.
   if (getOptLevel() == CodeGenOpt::Aggressive)
     addPass(createARMParallelDSPPass());
 
+  // Match complex arithmetic patterns.
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(createComplexArithmeticPass());
+
   // Match interleaved memory accesses to ldN/stN intrinsics.
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createInterleavedAccessPass());
 
   // Add Control Flow Guard checks.
if (TM->getTargetTriple().isOSWindows()) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -126,6 +126,11 @@ std::function SimplifyAndSetOp) const; + bool supportsComplexArithmetic() const; + Value *createComplexArithmeticIR(ComplexArithmeticGraph &G, Value *InputA, + Value *InputB, int &GeneratedIntrinsicCount); + bool matchComplexArithmeticIR(Instruction *I, ComplexArithmeticGraph &G); + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -20,8 +20,8 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" @@ -31,6 +31,7 @@ #include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Scalar/ComplexArithmetic.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include @@ -2344,3 +2345,169 @@ return false; return true; } + +bool ARMTTIImpl::supportsComplexArithmetic() const { + return ST->hasMVEFloatOps(); +} + +namespace { +struct ARMComplexArithmeticMetadata { + bool Halving; +}; +} // namespace + +Value *ARMTTIImpl::createComplexArithmeticIR(ComplexArithmeticGraph &G, + Value *InputA, Value *InputB, + int &GeneratedIntrinsicCount) { + auto *Ty = InputA->getType(); + if (!isa(Ty)) + return nullptr; + auto *VTy = cast(Ty); + + // Cannot widen complex intrinsics to fill vectors + if (VTy->getNumElements() * VTy->getScalarSizeInBits() != 128) + return nullptr; + + // MVE does not support double complex operations + if (VTy->getScalarType()->isDoubleTy()) + return nullptr; + + if (G.getType() == ComplexArithmeticGraph::Complex_Mul) { + + IRBuilder<> B(G.getCurrentInstruction()); + auto *IntTy = Type::getInt32Ty(G.getCurrentInstruction()->getContext()); + int RotIdx = G.getRotation() / 90; + + auto *ConstMulRot = ConstantInt::get(IntTy, RotIdx); + auto *ConstMlaRot = ConstantInt::get(IntTy, (RotIdx + 1) % 4); + auto *Mul = B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty, + {ConstMulRot, InputA, InputB}); + auto *Mla = B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty, + {ConstMlaRot, Mul, InputA, InputB}); + GeneratedIntrinsicCount = 2; + return Mla; + } + + if (G.getType() == ComplexArithmeticGraph::Complex_Add) { + IRBuilder<> B(G.getCurrentInstruction()); + + auto *IntTy = Type::getInt32Ty(G.getContext()); + unsigned HalvingVal = 1; + + auto *Meta = G.getUserData(); + if (Meta && Meta->Halving) + HalvingVal = 0; + + auto *Halving = ConstantInt::get(IntTy, HalvingVal); + + unsigned Rotation = G.getRotation(); + unsigned RotKey; + if (Rotation == 90) + RotKey = 0; + else if (G.getRotation() == 270) + RotKey = 1; + else + return nullptr; // Invalid rotation for arm_mve_vcaddq + + auto *RotVal = ConstantInt::get(IntTy, RotKey); + GeneratedIntrinsicCount = 1; + return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty, + {Halving, RotVal, InputA, InputB}); + } + + return nullptr; +} + +static bool matchComplexArithmeticHalvingAdd(Instruction *I, + 
ComplexArithmeticGraph &G) { + auto *SVI = dyn_cast(I); + if (!SVI) + return false; + + if (!SVI->getType()->getElementType()->isIntegerTy(32)) + return false; + + Value *ShuffleAR = nullptr; + Value *ShuffleAI = nullptr; + Value *ShuffleBR = nullptr; + Value *ShuffleBI = nullptr; + + auto *Op0 = cast(SVI->getOperand(0)); + auto *Op1 = cast(SVI->getOperand(1)); + + Op0 = cast(Op0->getOperand(0)); + Op1 = cast(Op1->getOperand(0)); + + unsigned Rotation; + if (Op0->getOpcode() == Instruction::FSub && + Op1->getOpcode() == Instruction::FAdd) { + Rotation = 90; + } else if (Op0->getOpcode() == Instruction::FAdd && + Op1->getOpcode() == Instruction::FSub) { + Rotation = 270; + } else { + return false; + } + + auto ShuffleMask = createInterleavingMask(SVI->getShuffleMask().size()); + + auto *FloatTy = Type::getFloatTy(G.getContext()); + + if (Rotation == 90) { + + auto FSubPattern = m_FSub(m_Value(ShuffleBR), m_Value(ShuffleAI)); + auto FAddPattern = m_FAdd(m_Value(ShuffleBI), m_Value(ShuffleAR)); + + auto *FP = ConstantFP::get(FixedVectorType::get(FloatTy, 4), 0.5f); + auto Mul0Pattern = m_FMul(FSubPattern, m_SpecificFP(0.5f)); + auto Mul1Pattern = m_FMul(FAddPattern, m_SpecificFP(0.5f)); + + if (!match(SVI, m_Shuffle(Mul0Pattern, Mul1Pattern, + m_SpecificMask(ShuffleMask)))) { + dbgs() + << "SVI does not match expected pattern for complex halving add rot " + << Rotation << ".\n"; + return false; + } + } else if (Rotation == 270) { + if (!match(SVI, m_Shuffle(m_FAdd(m_Value(ShuffleBI), m_Value(ShuffleAR)), + m_FSub(m_Value(ShuffleAI), m_Value(ShuffleBR)), + m_SpecificMask(ShuffleMask)))) { + dbgs() + << "SVI does not match expected pattern for complex halving add rot " + << Rotation << ".\n"; + return false; + } + } + + G.addNode(cast(ShuffleAR), + ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real); + G.addNode(cast(ShuffleAI), + ComplexArithmeticGraph::Shuffle | + ComplexArithmeticGraph::Imaginary); + G.addNode(cast(ShuffleBR), + ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real); + G.addNode(cast(ShuffleBI), + ComplexArithmeticGraph::Shuffle | + ComplexArithmeticGraph::Imaginary); + + G.addNode(Op0, ComplexArithmeticGraph::AddOperand); + G.addNode(Op1, ComplexArithmeticGraph::AddOperand); + + G.setType(ComplexArithmeticGraph::Complex_Add); + G.setRotation(Rotation); + auto *Meta = new ARMComplexArithmeticMetadata; + Meta->Halving = true; + G.setUserData(Meta); + + return true; +} + +bool ARMTTIImpl::matchComplexArithmeticIR(Instruction *I, + ComplexArithmeticGraph &G) { + // if (I->getType()->isIntOrIntVectorTy(32)) { + // if (matchComplexArithmeticHalvingAdd(I, G)) + // return true; + // } + return false; +} diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -4,6 +4,7 @@ AnnotationRemarks.cpp BDCE.cpp CallSiteSplitting.cpp + ComplexArithmetic.cpp ConstantHoisting.cpp ConstraintElimination.cpp CorrelatedValuePropagation.cpp diff --git a/llvm/lib/Transforms/Scalar/ComplexArithmetic.cpp b/llvm/lib/Transforms/Scalar/ComplexArithmetic.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/ComplexArithmetic.cpp @@ -0,0 +1,486 @@ +//===- ComplexArithmeticPass.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements generation of target-specific intrinsics to support
+// handling of complex number arithmetic.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/ComplexArithmetic.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "complex-arithmetic"
+
+STATISTIC(NumComplexIntrinsics, "Number of complex intrinsics generated");
+
+static cl::opt<bool> ComplexArithmeticEnabled(
+    "enable-complex-arithmetic",
+    cl::desc("Enable generation of complex arithmetic instructions"),
+    cl::init(true), cl::Hidden);
+
+namespace {
+
+class ComplexArithmeticLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  ComplexArithmeticLegacyPass() : FunctionPass(ID) {
+    initializeComplexArithmeticLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "Complex Arithmetic Pass"; }
+
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+};
+
+class ComplexArithmetic {
+public:
+  ComplexArithmetic(TargetTransformInfo *tti) : TTI(tti) {}
+  bool runOnFunction(Function &F);
+
+private:
+  bool evaluateComplexArithmeticBasicBlock(
+      BasicBlock *B, SmallVectorImpl<Instruction *> &DeadInsts);
+
+  void cleanupDeadInsts(SmallVectorImpl<Instruction *> &DeadInsts);
+
+  TargetTransformInfo *TTI = nullptr;
+};
+
+} // namespace
+
+char ComplexArithmeticLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ComplexArithmeticLegacyPass, DEBUG_TYPE,
+                      "Complex Arithmetic", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ComplexArithmeticLegacyPass, DEBUG_TYPE,
+                    "Complex Arithmetic", false, false)
+
+PreservedAnalyses ComplexArithmeticPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  if (!ComplexArithmetic(&TTI).runOnFunction(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+FunctionPass *llvm::createComplexArithmeticPass() {
+  return new ComplexArithmeticLegacyPass();
+}
+
+bool ComplexArithmeticLegacyPass::runOnFunction(Function &F) {
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  return ComplexArithmetic(&TTI).runOnFunction(F);
+}
+
+// Used so the "pass is disabled" messages are only reported once per process.
+static bool HasBeenDisabled = false;
+
+bool ComplexArithmetic::runOnFunction(Function &F) {
+  LLVM_DEBUG(dbgs() << "ComplexArithmetic::runOnFunction.\n");
+
+  if (!ComplexArithmeticEnabled) {
+    LLVM_DEBUG(if (!HasBeenDisabled) dbgs()
+                   << "Complex has been explicitly disabled.\n");
+    HasBeenDisabled = true;
+    return false;
+  }
+
+  if (!TTI->supportsComplexArithmetic()) {
+    LLVM_DEBUG(if (!HasBeenDisabled) dbgs()
+                   << "Complex has been disabled, "
+                      "target does not support lowering of complex numbers.\n");
+    HasBeenDisabled = true;
+    return false;
+  }
+
+  bool Changed = false;
+  SmallVector<Instruction *, 16> DeadInsts;
+  for (auto &B : F)
+    Changed |= evaluateComplexArithmeticBasicBlock(&B, DeadInsts);
+
+  if (Changed)
+    cleanupDeadInsts(DeadInsts);
+
+  return Changed;
+}
+
+/**
+ * Checks the given mask, and determines whether said mask is interleaving.
+ *
+ * To be interleaving, a mask must alternate between `i` and `i + (Length / 2)`,
+ * and must contain all numbers within the range of `[0..Length)`
+ * (e.g. a 4x vector interleaving mask would be <0, 2, 1, 3>).
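+ *
+ * For illustration:
+ *   isInterleavingMask({0, 4, 1, 5, 2, 6, 3, 7}, 4); // true
+ *   isInterleavingMask({0, 1, 2, 3, 4, 5, 6, 7}, 4); // false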
+ */ +static bool isInterleavingMask(ArrayRef Mask, int NumElements) { + if (Mask.size() != NumElements * 2) { + return false; + } + + for (unsigned i = 0; i < NumElements; ++i) { + if (Mask[(i * 2) + 1] != (Mask[i * 2] + NumElements)) { + return false; + } + } + + return true; +} + +/** + * Checks the mask of the given ShuffleVectorInst, and determines whether said + * shuffle is interleaving. See isInterleavingMask. + */ +static bool isInterleaving(ShuffleVectorInst *SVI) { + auto *Ty = dyn_cast(SVI->getOperand(0)->getType()); + if (!Ty) + return false; + + unsigned NumElements = Ty->getNumElements(); + return isInterleavingMask(SVI->getShuffleMask(), NumElements); +} + +/** + * Checks the given mask, and determines whether said mask is deinterleaving. + * + * To be deinterleaving, a mask must match the pattern `i * 2`, with an optional + * offset of 1. (e.g. a 4x vector deinterleaving mask would look like <0, 2, 4, + * 6> or <1, 3, 5, 7>). + */ +static bool isDeinterleavingMask(ArrayRef Mask, int NumElements) { + if (Mask.size() != NumElements) + return false; + + for (unsigned i = 0; i < Mask.size() - 1; ++i) { + if (Mask[i + 1] != (Mask[i] + NumElements)) + return false; + } + + return true; +} + +static bool matchComplexMul(ShuffleVectorInst *SVI, ComplexArithmeticGraph &G) { + + unsigned LikelyRotation = 0; + + Value *LeftShuffleAR; + Value *LeftShuffleAI; + Value *LeftShuffleBR; + Value *LeftShuffleBI; + + Value *RightShuffleAR; + Value *RightShuffleAI; + Value *RightShuffleBR; + Value *RightShuffleBI; + + auto Mask = createInterleavingMask(SVI->getShuffleMask().size()); + + auto InterleaveShuffleRot0Pattern = m_Shuffle( + m_FSub(m_FMul(m_Value(LeftShuffleBR), m_Value(LeftShuffleAR)), + m_FMul(m_Value(LeftShuffleBI), m_Value(LeftShuffleAI))), + m_FAdd(m_FMul(m_Value(RightShuffleBI), m_Value(RightShuffleAR)), + m_FMul(m_Value(RightShuffleBR), m_Value(RightShuffleAI))), + m_SpecificMask(Mask)); + + auto InterleaveShuffleRot180Pattern = m_Shuffle( + m_FSub(m_FMul(m_Value(LeftShuffleBI), m_Value(LeftShuffleAI)), + m_FMul(m_Value(LeftShuffleBR), m_Value(LeftShuffleAR))), + m_FSub(m_FMul(m_Value(RightShuffleBR), m_FNeg(m_Value(RightShuffleAI))), + m_FMul(m_Value(RightShuffleBI), m_Value(RightShuffleAR))), + m_SpecificMask(Mask)); + + if (match(SVI, InterleaveShuffleRot0Pattern)) + LikelyRotation = 0; + else if (match(SVI, InterleaveShuffleRot180Pattern)) + LikelyRotation = 180; + else { + LLVM_DEBUG(dbgs() << "SVI does not match expected patterns.\n"); + return false; + } + + if (LeftShuffleAR != RightShuffleAR) + return false; + if (LeftShuffleAI != RightShuffleAI) + return false; + if (LeftShuffleBR != RightShuffleBR) + return false; + if (LeftShuffleBI != RightShuffleBI) + return false; + + G.addNode(cast(LeftShuffleAR), + ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real); + G.addNode(cast(LeftShuffleAI), + ComplexArithmeticGraph::Shuffle | + ComplexArithmeticGraph::Imaginary); + G.addNode(cast(LeftShuffleBR), + ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real); + G.addNode(cast(LeftShuffleBI), + ComplexArithmeticGraph::Shuffle | + ComplexArithmeticGraph::Imaginary); + + auto *Op0 = cast(SVI->getOperand(0)); + auto *FAdd = cast(SVI->getOperand(1)); + + G.addNode(Op0, ComplexArithmeticGraph::Real); + G.addNode(FAdd, ComplexArithmeticGraph::Imaginary); + + G.addNode(cast(Op0->getOperand(0)), + ComplexArithmeticGraph::Discover); + G.addNode(cast(Op0->getOperand(1)), + ComplexArithmeticGraph::Discover); + G.addNode(cast(FAdd->getOperand(0)), + 
            ComplexArithmeticGraph::Discover);
+  G.addNode(cast<Instruction>(FAdd->getOperand(1)),
+            ComplexArithmeticGraph::Discover);
+
+  G.setType(ComplexArithmeticGraph::Complex_Mul);
+  G.setRotation(LikelyRotation);
+
+  return true;
+}
+
+static bool matchComplexAdd(ShuffleVectorInst *SVI, ComplexArithmeticGraph &G) {
+  Value *ShuffleAR = nullptr;
+  Value *ShuffleAI = nullptr;
+  Value *ShuffleBR = nullptr;
+  Value *ShuffleBI = nullptr;
+
+  auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
+  auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
+
+  if (!Op0 || !Op1)
+    return false;
+
+  unsigned Rotation;
+  if (Op0->getOpcode() == Instruction::FSub &&
+      Op1->getOpcode() == Instruction::FAdd) {
+    Rotation = 90;
+  } else if (Op0->getOpcode() == Instruction::FAdd &&
+             Op1->getOpcode() == Instruction::FSub) {
+    Rotation = 270;
+  } else {
+    return false;
+  }
+
+  auto ShuffleMask = createInterleavingMask(SVI->getShuffleMask().size());
+
+  if (Rotation == 90) {
+    if (!match(SVI, m_Shuffle(m_FSub(m_Value(ShuffleAR), m_Value(ShuffleBI)),
+                              m_FAdd(m_Value(ShuffleAI), m_Value(ShuffleBR)),
+                              m_SpecificMask(ShuffleMask)))) {
+      LLVM_DEBUG(
+          dbgs() << "SVI does not match expected pattern for complex add rot "
+                 << Rotation << ".\n");
+      return false;
+    }
+  } else if (Rotation == 270) {
+    if (!match(SVI, m_Shuffle(m_FAdd(m_Value(ShuffleBI), m_Value(ShuffleAR)),
+                              m_FSub(m_Value(ShuffleAI), m_Value(ShuffleBR)),
+                              m_SpecificMask(ShuffleMask)))) {
+      LLVM_DEBUG(
+          dbgs() << "SVI does not match expected pattern for complex add rot "
+                 << Rotation << ".\n");
+      return false;
+    }
+  }
+
+  if (!isa<ShuffleVectorInst>(ShuffleAR) ||
+      !isa<ShuffleVectorInst>(ShuffleAI) ||
+      !isa<ShuffleVectorInst>(ShuffleBR) ||
+      !isa<ShuffleVectorInst>(ShuffleBI)) {
+    LLVM_DEBUG(dbgs() << "SVI does not match expected pattern for complex add, "
+                         "inputs aren't all shuffles.\n");
+    return false;
+  }
+
+  G.addNode(cast<Instruction>(ShuffleAR),
+            ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real);
+  G.addNode(cast<Instruction>(ShuffleAI),
+            ComplexArithmeticGraph::Shuffle |
+                ComplexArithmeticGraph::Imaginary);
+  G.addNode(cast<Instruction>(ShuffleBR),
+            ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real);
+  G.addNode(cast<Instruction>(ShuffleBI),
+            ComplexArithmeticGraph::Shuffle |
+                ComplexArithmeticGraph::Imaginary);
+
+  G.addNode(Op0, ComplexArithmeticGraph::AddOperand);
+  G.addNode(Op1, ComplexArithmeticGraph::AddOperand);
+
+  G.setType(ComplexArithmeticGraph::Complex_Add);
+  G.setRotation(Rotation);
+
+  return true;
+}
+
+static bool traverseAndPopulateGraph(TargetTransformInfo *TTI, Instruction *I,
+                                     ComplexArithmeticGraph &G) {
+  G.setCurrentInstruction(I);
+
+  // Shuffle mask needs to interleave vectors,
+  // e.g.
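+  //   (lane k of the first source is paired with lane k of the second)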
+  //   <4 x i32> <0, 2, 1, 3>
+  //   <8 x i32> <0, 4, 1, 5, 2, 6, 3, 7>
+
+  if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+    if (!isInterleaving(SVI)) {
+      LLVM_DEBUG(dbgs() << "SVI doesn't appear to perform interleaving.\n");
+      return false;
+    }
+
+    if (matchComplexMul(SVI, G))
+      return true;
+
+    if (matchComplexAdd(SVI, G))
+      return true;
+
+    if (TTI->matchComplexArithmeticIR(SVI, G))
+      return true;
+  }
+
+  return false;
+}
+
+static bool substituteGraph(TargetTransformInfo *TTI, Instruction *I,
+                            ComplexArithmeticGraph &G,
+                            SmallVectorImpl<Instruction *> &DeadInsts) {
+  G.setCurrentInstruction(I);
+
+  SmallVector<Instruction *, 8> RealShuffles = G.getNodesOfType(
+      ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real);
+  if (RealShuffles.size() < 2)
+    return false;
+
+  auto *LoadA = RealShuffles[0]->getOperand(0);
+  auto *LoadB = RealShuffles[1]->getOperand(0);
+
+  auto *TyA = cast<FixedVectorType>(LoadA->getType());
+
+  const unsigned MaxVectorWidth = 128;
+  unsigned NumBits = TyA->getScalarSizeInBits() * TyA->getNumElements();
+  unsigned NumElementsPerVector = MaxVectorWidth / TyA->getScalarSizeInBits();
+  int GeneratedIntrinsics = 0;
+  if (NumBits > MaxVectorWidth) {
+    LLVM_DEBUG(dbgs() << "Split required, " << NumBits
+                      << " is greater than the max vector width ("
+                      << MaxVectorWidth << ").\n");
+    if (NumBits % MaxVectorWidth != 0) {
+      LLVM_DEBUG(dbgs() << "Vector can't be split evenly.\n");
+      return false;
+    }
+
+    IRBuilder<> B(I);
+
+    unsigned SplitCount = NumBits / MaxVectorWidth;
+
+    if (SplitCount > 2) {
+      LLVM_DEBUG(dbgs() << "Cannot split operation beyond 2.\n");
+      return false;
+    }
+
+    SmallVector<Value *, 8> CreatedInsts;
+    SmallVector<Value *, 2> ComplexIR;
+    for (unsigned i = 0; i < SplitCount; ++i) {
+      auto Mask =
+          createContiguousMask(NumElementsPerVector, NumElementsPerVector * i);
+      auto *Undef = UndefValue::get(LoadA->getType());
+      auto *ShuffleA = B.CreateShuffleVector(LoadA, Undef, Mask);
+      auto *ShuffleB = B.CreateShuffleVector(LoadB, Undef, Mask);
+
+      CreatedInsts.push_back(ShuffleA);
+      CreatedInsts.push_back(ShuffleB);
+
+      auto *IR = TTI->createComplexArithmeticIR(G, ShuffleA, ShuffleB,
+                                                GeneratedIntrinsics);
+      if (IR == nullptr) {
+        for (auto &Item : CreatedInsts)
+          DeadInsts.push_back(cast<Instruction>(Item));
+        return false;
+      }
+      NumComplexIntrinsics += GeneratedIntrinsics;
+      ComplexIR.push_back(IR);
+      CreatedInsts.push_back(IR);
+    }
+
+    auto ConcatMask = createContiguousMask(NumElementsPerVector * 2);
+    auto *Shuffle =
+        B.CreateShuffleVector(ComplexIR[0], ComplexIR[1], ConcatMask);
+    I->replaceAllUsesWith(Shuffle);
+  } else {
+    auto *Mla =
+        TTI->createComplexArithmeticIR(G, LoadA, LoadB, GeneratedIntrinsics);
+    if (Mla == nullptr)
+      return false;
+    NumComplexIntrinsics += GeneratedIntrinsics;
+    I->replaceAllUsesWith(Mla);
+  }
+
+  for (Instruction *Node : G.getAllNodes())
+    DeadInsts.push_back(Node);
+
+  return true;
+}
+
+bool ComplexArithmetic::evaluateComplexArithmeticBasicBlock(
+    BasicBlock *B, SmallVectorImpl<Instruction *> &DeadInsts) {
+  bool Substituted = false;
+
+  for (auto &I : *B) {
+    auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
+    if (!SVI || !isInterleaving(SVI))
+      continue;
+
+    // Build a fresh graph per candidate so nodes from a previous match cannot
+    // leak into the next one.
+    ComplexArithmeticGraph Graph;
+    Graph.addNode(SVI, ComplexArithmeticGraph::Shuffle);
+    if (traverseAndPopulateGraph(TTI, SVI, Graph))
+      Substituted |= substituteGraph(TTI, &I, Graph, DeadInsts);
+  }
+
+  return Substituted;
+}
+
+void ComplexArithmetic::cleanupDeadInsts(
+    SmallVectorImpl<Instruction *> &DeadInsts) {
+
+  // TODO clean up the dead instructions better. (Ask in review?)
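+  // A possible tidier approach (untested sketch): hand each candidate to
+  // llvm::RecursivelyDeleteTriviallyDeadInstructions() from
+  // llvm/Transforms/Utils/Local.h once its uses are gone, rather than
+  // scanning the list to a fixed point by hand.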
+ unsigned iter = 0; + unsigned count = DeadInsts.size(); + unsigned remaining = DeadInsts.size(); + while (!DeadInsts.empty() && remaining > 0 && iter < count) { + ++iter; + remaining = 0; + for (auto *It = DeadInsts.begin(); It != DeadInsts.end(); It++) { + auto *I = *It; + + if (I->getParent()) + remaining++; + + if (I->getNumUses() == 0 && I->getParent()) { + remaining--; + I->eraseFromParent(); + } + } + } + + DeadInsts.clear(); +} diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -45,6 +45,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Transform functions to use DSP intrinsics +; CHECK-NEXT: Complex Arithmetic Pass ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: Type Promotion ; CHECK-NEXT: CodeGen Prepare diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-add.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-add.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 3 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_add_v2f16(<2 x half> %wide.vec, <2 x half> %wide.vec23, <2 x half>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v2f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #16] +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vsub.f16 s4, s0, s2 +; CHECK-NEXT: vadd.f16 s2, s2, s0 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: b .LBB0_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { half, half }* null to <2 x half>* + %wide.vec2 = load <2 x half>, <2 x half>* null, align 4 + %strided.vec = shufflevector <2 x half> %wide.vec, <2 x half> zeroinitializer, <1 x i32> + %strided.vec22 = shufflevector <2 x half> %wide.vec, <2 x half> zeroinitializer, <1 x i32> + %wide.vec233 = load <2 x half>, <2 x half>* null, align 4 + %strided.vec24 = shufflevector <2 x half> %wide.vec, <2 x half> zeroinitializer, <1 x i32> + %strided.vec25 = shufflevector <2 x half> %wide.vec, <2 x half> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x half> %strided.vec24, %strided.vec22 + %1 = fadd fast <1 x half> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <1 x half> %0, <1 x half> %1, <2 x i32> + store <2 x half> %interleaved.vec, <2 x half>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v4f16(<4 x half> %wide.vec, <4 x half> %wide.vec23, <4 x half>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v4f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #16] +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vmovx.f16 s4, s0 +; 
CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vins.f16 s8, s1 +; CHECK-NEXT: vsub.f16 q3, q2, q1 +; CHECK-NEXT: vadd.f16 q1, q1, q2 +; CHECK-NEXT: vmovx.f16 s13, s12 +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vins.f16 s13, s2 +; CHECK-NEXT: vmov r1, r2, d6 +; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: b .LBB1_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { half, half }* null to <4 x half>* + %wide.vec2 = load <4 x half>, <4 x half>* null, align 4 + %strided.vec = shufflevector <4 x half> %wide.vec, <4 x half> zeroinitializer, <2 x i32> + %strided.vec22 = shufflevector <4 x half> %wide.vec, <4 x half> zeroinitializer, <2 x i32> + %wide.vec233 = load <4 x half>, <4 x half>* null, align 4 + %strided.vec24 = shufflevector <4 x half> %wide.vec, <4 x half> zeroinitializer, <2 x i32> + %strided.vec25 = shufflevector <4 x half> %wide.vec, <4 x half> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x half> %strided.vec24, %strided.vec22 + %1 = fadd fast <2 x half> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <2 x half> %0, <2 x half> %1, <4 x i32> + store <4 x half> %interleaved.vec, <4 x half>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v8f16(<8 x half> %wide.vec, <8 x half> %wide.vec23, <8 x half>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v8f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #16] +; CHECK-NEXT: vcadd.f16 q0, q0, q0, #90 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: b .LBB2_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { half, half }* null to <8 x half>* + %wide.vec2 = load <8 x half>, <8 x half>* null, align 4 + %strided.vec = shufflevector <8 x half> %wide.vec, <8 x half> zeroinitializer, <4 x i32> + %strided.vec22 = shufflevector <8 x half> %wide.vec, <8 x half> zeroinitializer, <4 x i32> + %wide.vec233 = load <8 x half>, <8 x half>* null, align 4 + %strided.vec24 = shufflevector <8 x half> %wide.vec, <8 x half> zeroinitializer, <4 x i32> + %strided.vec25 = shufflevector <8 x half> %wide.vec, <8 x half> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x half> %strided.vec24, %strided.vec22 + %1 = fadd fast <4 x half> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <4 x half> %0, <4 x half> %1, <8 x i32> + store <8 x half> %interleaved.vec, <8 x half>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v16f16(<16 x half> %wide.vec, <16 x half> %wide.vec23, <16 x half>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v16f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: ldr r0, [sp, #48] +; CHECK-NEXT: vcadd.f16 q0, q0, q0, #90 +; CHECK-NEXT: vcadd.f16 q1, q1, q1, #90 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q1, [r0, #16] +; CHECK-NEXT: b .LBB3_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { half, half }* null to <16 x half>* + %wide.vec2 = load <16 x half>, <16 x half>* null, align 4 
+ %strided.vec = shufflevector <16 x half> %wide.vec, <16 x half> zeroinitializer, <8 x i32> + %strided.vec22 = shufflevector <16 x half> %wide.vec, <16 x half> zeroinitializer, <8 x i32> + %wide.vec233 = load <16 x half>, <16 x half>* null, align 4 + %strided.vec24 = shufflevector <16 x half> %wide.vec, <16 x half> zeroinitializer, <8 x i32> + %strided.vec25 = shufflevector <16 x half> %wide.vec, <16 x half> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x half> %strided.vec24, %strided.vec22 + %1 = fadd fast <8 x half> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <8 x half> %0, <8 x half> %1, <16 x i32> + store <16 x half> %interleaved.vec, <16 x half>* %lsr.iv5153, align 4 + br label %vector.body +} diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-mul.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-mul.ll @@ -0,0 +1,194 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 6 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_mul_v2f16(half* %a, half* %b, half* %c) #0 { +; CHECK-LABEL: complex_mul_v2f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r2, [r0] +; CHECK-NEXT: ldr r3, [r1] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vmul.f16 s8, s6, s2 +; CHECK-NEXT: vmul.f16 s2, s4, s2 +; CHECK-NEXT: vfnms.f16 s8, s4, s0 +; CHECK-NEXT: vfma.f16 s2, s6, s0 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: str r2, [r0] +; CHECK-NEXT: b .LBB0_1 +vector.ph: + br label %vector.body + +vector.body: + %a.ptr = bitcast half* %a to <2 x half>* + %b.ptr = bitcast half* %b to <2 x half>* + %c.ptr = bitcast half* %c to <2 x half>* + %a.val = load <2 x half>, <2 x half>* %a.ptr + %b.val = load <2 x half>, <2 x half>* %b.ptr + %strided.vec = shufflevector <2 x half> %a.val, <2 x half> poison, <1 x i32> + %strided.vec46 = shufflevector <2 x half> %a.val, <2 x half> poison, <1 x i32> + %strided.vec48 = shufflevector <2 x half> %b.val, <2 x half> poison, <1 x i32> + %strided.vec49 = shufflevector <2 x half> %b.val, <2 x half> poison, <1 x i32> + %0 = fmul fast <1 x half> %strided.vec48, %strided.vec + %1 = fmul fast <1 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <1 x half> %0, %1 + %3 = fmul fast <1 x half> %strided.vec49, %strided.vec + %4 = fmul fast <1 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <1 x half> %3, %4 + %6 = bitcast half* undef to <2 x half>* + %interleaved.vec = shufflevector <1 x half> %2, <1 x half> %5, <2 x i32> + store <2 x half> %interleaved.vec, <2 x half>* %6, align 4 + br label %vector.body +} + +define void @complex_mul_v4f16(half* %a, half* %b, half* %c) #0 { +; CHECK-LABEL: complex_mul_v4f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrd r2, r12, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: 
ldrd r2, r3, [r1] +; CHECK-NEXT: vmov.32 q0[1], r12 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s16, s4 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s16, s2 +; CHECK-NEXT: vmul.f16 q3, q4, q2 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vneg.f16 q3, q3 +; CHECK-NEXT: vfma.f16 q3, q1, q0 +; CHECK-NEXT: vmul.f16 q1, q1, q2 +; CHECK-NEXT: vfma.f16 q1, q4, q0 +; CHECK-NEXT: vmovx.f16 s13, s12 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vins.f16 s13, s0 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: str r3, [r0] +; CHECK-NEXT: str r2, [r0] +; CHECK-NEXT: b .LBB1_1 +vector.ph: + br label %vector.body + +vector.body: + %a.ptr = bitcast half* %a to <4 x half>* + %b.ptr = bitcast half* %b to <4 x half>* + %c.ptr = bitcast half* %c to <4 x half>* + %a.val = load <4 x half>, <4 x half>* %a.ptr + %b.val = load <4 x half>, <4 x half>* %b.ptr + %strided.vec = shufflevector <4 x half> %a.val, <4 x half> poison, <2 x i32> + %strided.vec46 = shufflevector <4 x half> %a.val, <4 x half> poison, <2 x i32> + %strided.vec48 = shufflevector <4 x half> %b.val, <4 x half> poison, <2 x i32> + %strided.vec49 = shufflevector <4 x half> %b.val, <4 x half> poison, <2 x i32> + %0 = fmul fast <2 x half> %strided.vec48, %strided.vec + %1 = fmul fast <2 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <2 x half> %0, %1 + %3 = fmul fast <2 x half> %strided.vec49, %strided.vec + %4 = fmul fast <2 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <2 x half> %3, %4 + %6 = bitcast half* undef to <4 x half>* + %interleaved.vec = shufflevector <2 x half> %2, <2 x half> %5, <4 x i32> + store <4 x half> %interleaved.vec, <4 x half>* %6, align 4 + br label %vector.body +} + +define void @complex_mul_v8f16(half* %a, half* %b, half* %c) #0 { +; CHECK-LABEL: complex_mul_v8f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vcmul.f16 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f16 q2, q0, q1, #90 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: b .LBB2_1 +vector.ph: + br label %vector.body + +vector.body: + %a.ptr = bitcast half* %a to <8 x half>* + %b.ptr = bitcast half* %b to <8 x half>* + %c.ptr = bitcast half* %c to <8 x half>* + %a.val = load <8 x half>, <8 x half>* %a.ptr + %b.val = load <8 x half>, <8 x half>* %b.ptr + %strided.vec = shufflevector <8 x half> %a.val, <8 x half> poison, <4 x i32> + %strided.vec46 = shufflevector <8 x half> %a.val, <8 x half> poison, <4 x i32> + %strided.vec48 = shufflevector <8 x half> %b.val, <8 x half> poison, <4 x i32> + %strided.vec49 = shufflevector <8 x half> %b.val, <8 x half> poison, <4 x i32> + %0 = fmul fast <4 x half> %strided.vec48, %strided.vec + %1 = fmul fast <4 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <4 x half> %0, %1 + %3 = fmul fast <4 x half> %strided.vec49, %strided.vec + %4 = fmul fast <4 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <4 x half> %3, %4 + %6 = bitcast half* undef to <8 x half>* + %interleaved.vec = shufflevector <4 x half> %2, <4 x half> %5, <8 x i32> + store <8 x half> %interleaved.vec, <8 x half>* %6, align 4 + br label %vector.body +} + +define void @complex_mul_v16f16(half* %a, half* %b, half* %c) #0 { +; CHECK-LABEL: 
complex_mul_v16f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r1, #16] +; CHECK-NEXT: vcmul.f16 q4, q1, q2, #0 +; CHECK-NEXT: vcmla.f16 q4, q1, q2, #90 +; CHECK-NEXT: vcmul.f16 q1, q0, q3, #0 +; CHECK-NEXT: vcmla.f16 q1, q0, q3, #90 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: vstrw.32 q4, [r0] +; CHECK-NEXT: b .LBB3_1 +vector.ph: + br label %vector.body + +vector.body: + %a.ptr = bitcast half* %a to <16 x half>* + %b.ptr = bitcast half* %b to <16 x half>* + %c.ptr = bitcast half* %c to <16 x half>* + %a.val = load <16 x half>, <16 x half>* %a.ptr + %b.val = load <16 x half>, <16 x half>* %b.ptr + %strided.vec = shufflevector <16 x half> %a.val, <16 x half> poison, <8 x i32> + %strided.vec46 = shufflevector <16 x half> %a.val, <16 x half> poison, <8 x i32> + %strided.vec48 = shufflevector <16 x half> %b.val, <16 x half> poison, <8 x i32> + %strided.vec49 = shufflevector <16 x half> %b.val, <16 x half> poison, <8 x i32> + %0 = fmul fast <8 x half> %strided.vec48, %strided.vec + %1 = fmul fast <8 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <8 x half> %0, %1 + %3 = fmul fast <8 x half> %strided.vec49, %strided.vec + %4 = fmul fast <8 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <8 x half> %3, %4 + %6 = bitcast half* undef to <16 x half>* + %interleaved.vec = shufflevector <8 x half> %2, <8 x half> %5, <16 x i32> + store <16 x half> %interleaved.vec, <16 x half>* %6, align 4 + br label %vector.body +} diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-add.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-add.ll @@ -0,0 +1,173 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 3 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_add_v2f32(<2 x float> %wide.vec, <2 x float> %wide.vec23, <2 x float>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v2f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #16] +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsub.f32 s2, s0, s1 +; CHECK-NEXT: vadd.f32 s4, s1, s0 +; CHECK-NEXT: vstr s2, [r0] +; CHECK-NEXT: vstr s4, [r0, #4] +; CHECK-NEXT: b .LBB0_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { float, float }* null to <2 x float>* + %wide.vec2 = load <2 x float>, <2 x float>* null, align 4 + %strided.vec = shufflevector <2 x float> %wide.vec, <2 x float> zeroinitializer, <1 x i32> + %strided.vec22 = shufflevector <2 x float> %wide.vec, <2 x float> zeroinitializer, <1 x i32> + %wide.vec233 = load <2 x float>, <2 x float>* null, align 4 + %strided.vec24 = shufflevector <2 x float> %wide.vec, <2 x float> zeroinitializer, <1 x i32> + %strided.vec25 = shufflevector <2 x float> %wide.vec, 
<2 x float> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x float> %strided.vec24, %strided.vec22 + %1 = fadd fast <1 x float> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <1 x float> %0, <1 x float> %1, <2 x i32> + store <2 x float> %interleaved.vec, <2 x float>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v4f32(<4 x float> %wide.vec, <4 x float> %wide.vec23, <4 x float>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v4f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d3, r2, r3 +; CHECK-NEXT: vmov d2, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #16] +; CHECK-NEXT: vcadd.f32 q0, q1, q1, #90 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: b .LBB1_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { float, float }* null to <4 x float>* + %wide.vec2 = load <4 x float>, <4 x float>* null, align 4 + %strided.vec = shufflevector <4 x float> %wide.vec, <4 x float> zeroinitializer, <2 x i32> + %strided.vec22 = shufflevector <4 x float> %wide.vec, <4 x float> zeroinitializer, <2 x i32> + %wide.vec233 = load <4 x float>, <4 x float>* null, align 4 + %strided.vec24 = shufflevector <4 x float> %wide.vec, <4 x float> zeroinitializer, <2 x i32> + %strided.vec25 = shufflevector <4 x float> %wide.vec, <4 x float> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x float> %strided.vec24, %strided.vec22 + %1 = fadd fast <2 x float> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + store <4 x float> %interleaved.vec, <4 x float>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v8f32(<8 x float> %wide.vec, <8 x float> %wide.vec23, <8 x float>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v8f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d2, r0, r1 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmov d3, r2, r3 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: ldr r0, [sp, #48] +; CHECK-NEXT: vcadd.f32 q0, q1, q1, #90 +; CHECK-NEXT: vcadd.f32 q1, q2, q2, #90 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q1, [r0, #16] +; CHECK-NEXT: b .LBB2_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { float, float }* null to <8 x float>* + %wide.vec2 = load <8 x float>, <8 x float>* null, align 4 + %strided.vec = shufflevector <8 x float> %wide.vec, <8 x float> zeroinitializer, <4 x i32> + %strided.vec22 = shufflevector <8 x float> %wide.vec, <8 x float> zeroinitializer, <4 x i32> + %wide.vec233 = load <8 x float>, <8 x float>* null, align 4 + %strided.vec24 = shufflevector <8 x float> %wide.vec, <8 x float> zeroinitializer, <4 x i32> + %strided.vec25 = shufflevector <8 x float> %wide.vec, <8 x float> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x float> %strided.vec24, %strided.vec22 + %1 = fadd fast <4 x float> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> + store <8 x float> %interleaved.vec, <8 x float>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v16f32(<16 x float> %wide.vec, <16 x float> %wide.vec23, <16 x float>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v16f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .vsave {d8, 
d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: add r2, sp, #112 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: add r0, sp, #144 +; CHECK-NEXT: add r1, sp, #128 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: ldr r0, [sp, #224] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q3, [r2] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s0 +; CHECK-NEXT: vmov.f32 s19, s2 +; CHECK-NEXT: vmov.f32 s22, s1 +; CHECK-NEXT: vmov.f32 s23, s3 +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s16, s8 +; CHECK-NEXT: vmov.f32 s17, s10 +; CHECK-NEXT: vmov.f32 s20, s9 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vsub.f32 q6, q4, q5 +; CHECK-NEXT: vadd.f32 q7, q5, q4 +; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vmov.f32 s17, s2 +; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s5 +; CHECK-NEXT: vmov.f32 s21, s3 +; CHECK-NEXT: vmov.f32 s22, s13 +; CHECK-NEXT: vmov.f32 s23, s15 +; CHECK-NEXT: vsub.f32 q0, q4, q5 +; CHECK-NEXT: vadd.f32 q1, q5, q4 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: vst20.32 {q0, q1}, [r0] +; CHECK-NEXT: vst21.32 {q0, q1}, [r1]! +; CHECK-NEXT: vst20.32 {q6, q7}, [r1] +; CHECK-NEXT: vst21.32 {q6, q7}, [r1] +; CHECK-NEXT: b .LBB3_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { float, float }* null to <16 x float>* + %wide.vec2 = load <16 x float>, <16 x float>* null, align 4 + %strided.vec = shufflevector <16 x float> %wide.vec, <16 x float> zeroinitializer, <8 x i32> + %strided.vec22 = shufflevector <16 x float> %wide.vec, <16 x float> zeroinitializer, <8 x i32> + %wide.vec233 = load <16 x float>, <16 x float>* null, align 4 + %strided.vec24 = shufflevector <16 x float> %wide.vec, <16 x float> zeroinitializer, <8 x i32> + %strided.vec25 = shufflevector <16 x float> %wide.vec, <16 x float> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x float> %strided.vec24, %strided.vec22 + %1 = fadd fast <8 x float> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <8 x float> %0, <8 x float> %1, <16 x i32> + store <16 x float> %interleaved.vec, <16 x float>* %lsr.iv5153, align 4 + br label %vector.body +} diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-mul.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-mul.ll @@ -0,0 +1,197 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 6 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_mul_v2f32(float* %a, float* %b, float* %c) #0 { +; CHECK-LABEL: complex_mul_v2f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; 
+
+target triple = "arm-arm-none-eabi"
+attributes #0 = { "target-cpu"="cortex-m55" }
+
+define void @complex_mul_v2f32(float* %a, float* %b, float* %c) #0 {
+; CHECK-LABEL: complex_mul_v2f32:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB0_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldr s2, [r0, #4]
+; CHECK-NEXT: vldr s4, [r1, #4]
+; CHECK-NEXT: vldr s6, [r1]
+; CHECK-NEXT: vldr s0, [r0]
+; CHECK-NEXT: vmul.f32 s8, s4, s2
+; CHECK-NEXT: vmul.f32 s2, s6, s2
+; CHECK-NEXT: vfma.f32 s2, s4, s0
+; CHECK-NEXT: vfnms.f32 s8, s6, s0
+; CHECK-NEXT: vstr s2, [r0]
+; CHECK-NEXT: vstr s8, [r0]
+; CHECK-NEXT: b .LBB0_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast float* %a to <2 x float>*
+  %b.ptr = bitcast float* %b to <2 x float>*
+  %c.ptr = bitcast float* %c to <2 x float>*
+  %a.val = load <2 x float>, <2 x float>* %a.ptr
+  %b.val = load <2 x float>, <2 x float>* %b.ptr
+  %strided.vec = shufflevector <2 x float> %a.val, <2 x float> poison, <1 x i32> <i32 0>
+  %strided.vec46 = shufflevector <2 x float> %a.val, <2 x float> poison, <1 x i32> <i32 1>
+  %strided.vec48 = shufflevector <2 x float> %b.val, <2 x float> poison, <1 x i32> <i32 0>
+  %strided.vec49 = shufflevector <2 x float> %b.val, <2 x float> poison, <1 x i32> <i32 1>
+  %0 = fmul fast <1 x float> %strided.vec48, %strided.vec
+  %1 = fmul fast <1 x float> %strided.vec49, %strided.vec46
+  %2 = fsub fast <1 x float> %0, %1
+  %3 = fmul fast <1 x float> %strided.vec49, %strided.vec
+  %4 = fmul fast <1 x float> %strided.vec48, %strided.vec46
+  %5 = fadd fast <1 x float> %3, %4
+  %6 = bitcast float* undef to <2 x float>*
+  %interleaved.vec = shufflevector <1 x float> %2, <1 x float> %5, <2 x i32> <i32 0, i32 1>
+  store <2 x float> %interleaved.vec, <2 x float>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v4f32(float* %a, float* %b, float* %c) #0 {
+; CHECK-LABEL: complex_mul_v4f32:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB1_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0
+; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90
+; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: b .LBB1_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast float* %a to <4 x float>*
+  %b.ptr = bitcast float* %b to <4 x float>*
+  %c.ptr = bitcast float* %c to <4 x float>*
+  %a.val = load <4 x float>, <4 x float>* %a.ptr
+  %b.val = load <4 x float>, <4 x float>* %b.ptr
+  %strided.vec = shufflevector <4 x float> %a.val, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec46 = shufflevector <4 x float> %a.val, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec48 = shufflevector <4 x float> %b.val, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec49 = shufflevector <4 x float> %b.val, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec48, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec49, %strided.vec46
+  %2 = fsub fast <2 x float> %0, %1
+  %3 = fmul fast <2 x float> %strided.vec49, %strided.vec
+  %4 = fmul fast <2 x float> %strided.vec48, %strided.vec46
+  %5 = fadd fast <2 x float> %3, %4
+  %6 = bitcast float* undef to <4 x float>*
+  %interleaved.vec = shufflevector <2 x float> %2, <2 x float> %5, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x float> %interleaved.vec, <4 x float>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v8f32(float* %a, float* %b, float* %c) #0 {
+; CHECK-LABEL: complex_mul_v8f32:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB2_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q3, [r1, #16]
+; CHECK-NEXT: vcmul.f32 q4, q1, q2, #0
+; CHECK-NEXT: vcmla.f32 q4, q1, q2, #90
+; CHECK-NEXT: vcmul.f32 q1, q0, q3, #0
+; CHECK-NEXT: vcmla.f32 q1, q0, q3, #90
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q4, [r0]
+; CHECK-NEXT: b .LBB2_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast float* %a to <8 x float>*
+  %b.ptr = bitcast float* %b to <8 x float>*
+  %c.ptr = bitcast float* %c to <8 x float>*
+  %a.val = load <8 x float>, <8 x float>* %a.ptr
+  %b.val = load <8 x float>, <8 x float>* %b.ptr
+  %strided.vec = shufflevector <8 x float> %a.val, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec46 = shufflevector <8 x float> %a.val, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec48 = shufflevector <8 x float> %b.val, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec49 = shufflevector <8 x float> %b.val, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x float> %strided.vec48, %strided.vec
+  %1 = fmul fast <4 x float> %strided.vec49, %strided.vec46
+  %2 = fsub fast <4 x float> %0, %1
+  %3 = fmul fast <4 x float> %strided.vec49, %strided.vec
+  %4 = fmul fast <4 x float> %strided.vec48, %strided.vec46
+  %5 = fadd fast <4 x float> %3, %4
+  %6 = bitcast float* undef to <8 x float>*
+  %interleaved.vec = shufflevector <4 x float> %2, <4 x float> %5, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v16f32(float* %a, float* %b, float* %c) #0 {
+; CHECK-LABEL: complex_mul_v16f32:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .save {r4, r5}
+; CHECK-NEXT: push {r4, r5}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB3_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vld20.32 {q2, q3}, [r1]
+; CHECK-NEXT: mov r3, r1
+; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: vld21.32 {q2, q3}, [r3]!
+; CHECK-NEXT: vld21.32 {q0, q1}, [r2]!
+; CHECK-NEXT: vld20.32 {q4, q5}, [r3]
+; CHECK-NEXT: vld20.32 {q6, q7}, [r2]
+; CHECK-NEXT: vld21.32 {q4, q5}, [r3]
+; CHECK-NEXT: vld21.32 {q6, q7}, [r2]
+; CHECK-NEXT: vstmia sp, {d4, d5, d6, d7} @ 32-byte Spill
+; CHECK-NEXT: vmov q2, q5
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmul.f32 q4, q2, q7
+; CHECK-NEXT: vneg.f32 q4, q4
+; CHECK-NEXT: vfma.f32 q4, q5, q6
+; CHECK-NEXT: vmul.f32 q5, q5, q7
+; CHECK-NEXT: vfma.f32 q5, q2, q6
+; CHECK-NEXT: vldmia sp, {d4, d5, d6, d7} @ 32-byte Reload
+; CHECK-NEXT: vmul.f32 q6, q3, q1
+; CHECK-NEXT: vneg.f32 q6, q6
+; CHECK-NEXT: vmul.f32 q7, q2, q1
+; CHECK-NEXT: vfma.f32 q6, q2, q0
+; CHECK-NEXT: vfma.f32 q7, q3, q0
+; CHECK-NEXT: vst20.32 {q6, q7}, [r0]
+; CHECK-NEXT: vst21.32 {q6, q7}, [r0]
+; CHECK-NEXT: vst20.32 {q4, q5}, [r0]
+; CHECK-NEXT: vst21.32 {q4, q5}, [r0]
+; CHECK-NEXT: b .LBB3_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast float* %a to <16 x float>*
+  %b.ptr = bitcast float* %b to <16 x float>*
+  %c.ptr = bitcast float* %c to <16 x float>*
+  %a.val = load <16 x float>, <16 x float>* %a.ptr
+  %b.val = load <16 x float>, <16 x float>* %b.ptr
+  %strided.vec = shufflevector <16 x float> %a.val, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.vec46 = shufflevector <16 x float> %a.val, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %strided.vec48 = shufflevector <16 x float> %b.val, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.vec49 = shufflevector <16 x float> %b.val, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fmul fast <8 x float> %strided.vec48, %strided.vec
+  %1 = fmul fast <8 x float> %strided.vec49, %strided.vec46
+  %2 = fsub fast <8 x float> %0, %1
+  %3 = fmul fast <8 x float> %strided.vec49, %strided.vec
+  %4 = fmul fast <8 x float> %strided.vec48, %strided.vec46
+  %5 = fadd fast <8 x float> %3, %4
+  %6 = bitcast float* undef to <16 x float>*
+  %interleaved.vec = shufflevector <8 x float> %2, <8 x float> %5, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x float> %interleaved.vec, <16 x float>* %6, align 4
+  br label %vector.body
+}
diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f64-mul.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f64-mul.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f64-mul.ll
@@ -0,0 +1,256 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -o - | FileCheck %s
+; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS
+
+; NOTE: This statistic shouldn't appear; MVE doesn't have f64 complex instructions.
+; STATS-NOT: "complex-arithmetic.NumComplexIntrinsics"
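+
+; NOTE: Presumably the multiply pattern is still recognised here, but the
+; target reports no profitable f64 lowering, so the checks below pin the
+; generic scalar VFP expansion (vmul.f64/vfma.f64/vfnms.f64) rather than
+; vcmul/vcmla.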
+
+target triple = "arm-arm-none-eabi"
+attributes #0 = { "target-cpu"="cortex-m55" }
+
+define void @complex_mul_v2f64(double* %a, double* %b, double* %c) #0 {
+; CHECK-LABEL: complex_mul_v2f64:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB0_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vmul.f64 d4, d3, d1
+; CHECK-NEXT: vmul.f64 d5, d2, d1
+; CHECK-NEXT: vfnms.f64 d4, d2, d0
+; CHECK-NEXT: vfma.f64 d5, d3, d0
+; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: b .LBB0_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast double* %a to <2 x double>*
+  %b.ptr = bitcast double* %b to <2 x double>*
+  %c.ptr = bitcast double* %c to <2 x double>*
+  %a.val = load <2 x double>, <2 x double>* %a.ptr
+  %b.val = load <2 x double>, <2 x double>* %b.ptr
+  %strided.vec = shufflevector <2 x double> %a.val, <2 x double> poison, <1 x i32> <i32 0>
+  %strided.vec46 = shufflevector <2 x double> %a.val, <2 x double> poison, <1 x i32> <i32 1>
+  %strided.vec48 = shufflevector <2 x double> %b.val, <2 x double> poison, <1 x i32> <i32 0>
+  %strided.vec49 = shufflevector <2 x double> %b.val, <2 x double> poison, <1 x i32> <i32 1>
+  %0 = fmul fast <1 x double> %strided.vec48, %strided.vec
+  %1 = fmul fast <1 x double> %strided.vec49, %strided.vec46
+  %2 = fsub fast <1 x double> %0, %1
+  %3 = fmul fast <1 x double> %strided.vec49, %strided.vec
+  %4 = fmul fast <1 x double> %strided.vec48, %strided.vec46
+  %5 = fadd fast <1 x double> %3, %4
+  %6 = bitcast double* undef to <2 x double>*
+  %interleaved.vec = shufflevector <1 x double> %2, <1 x double> %5, <2 x i32> <i32 0, i32 1>
+  store <2 x double> %interleaved.vec, <2 x double>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v4f64(double* %a, double* %b, double* %c) #0 {
+; CHECK-LABEL: complex_mul_v4f64:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB1_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q4, [r1]
+; CHECK-NEXT: vmul.f64 d4, d3, d1
+; CHECK-NEXT: vmul.f64 d5, d2, d1
+; CHECK-NEXT: vfnms.f64 d4, d2, d0
+; CHECK-NEXT: vfma.f64 d5, d3, d0
+; CHECK-NEXT: vmul.f64 d0, d9, d7
+; CHECK-NEXT: vmul.f64 d1, d8, d7
+; CHECK-NEXT: vfnms.f64 d0, d8, d6
+; CHECK-NEXT: vfma.f64 d1, d9, d6
+; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: b .LBB1_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast double* %a to <4 x double>*
+  %b.ptr = bitcast double* %b to <4 x double>*
+  %c.ptr = bitcast double* %c to <4 x double>*
+  %a.val = load <4 x double>, <4 x double>* %a.ptr
+  %b.val = load <4 x double>, <4 x double>* %b.ptr
+  %strided.vec = shufflevector <4 x double> %a.val, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec46 = shufflevector <4 x double> %a.val, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec48 = shufflevector <4 x double> %b.val, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec49 = shufflevector <4 x double> %b.val, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %strided.vec48, %strided.vec
+  %1 = fmul fast <2 x double> %strided.vec49, %strided.vec46
+  %2 = fsub fast <2 x double> %0, %1
+  %3 = fmul fast <2 x double> %strided.vec49, %strided.vec
+  %4 = fmul fast <2 x double> %strided.vec48, %strided.vec46
+  %5 = fadd fast <2 x double> %3, %4
+  %6 = bitcast double* undef to <4 x double>*
+  %interleaved.vec = shufflevector <2 x double> %2, <2 x double> %5, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x double> %interleaved.vec, <4 x double>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v8f64(double* %a, double* %b, double* %c) #0 {
+; CHECK-LABEL: complex_mul_v8f64:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB2_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q4, [r1, #32]
+; CHECK-NEXT: vmul.f64 d2, d5, d1
+; CHECK-NEXT: vmul.f64 d3, d4, d1
+; CHECK-NEXT: vfnms.f64 d2, d4, d0
+; CHECK-NEXT: vfma.f64 d3, d5, d0
+; CHECK-NEXT: vmul.f64 d0, d9, d7
+; CHECK-NEXT: vmul.f64 d1, d8, d7
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q5, [r1, #48]
+; CHECK-NEXT: vfnms.f64 d0, d8, d6
+; CHECK-NEXT: vfma.f64 d1, d9, d6
+; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q6, [r1, #16]
+; CHECK-NEXT: vmul.f64 d6, d11, d5
+; CHECK-NEXT: vmul.f64 d7, d10, d5
+; CHECK-NEXT: vfnms.f64 d6, d10, d4
+; CHECK-NEXT: vfma.f64 d7, d11, d4
+; CHECK-NEXT: vmul.f64 d4, d13, d9
+; CHECK-NEXT: vmul.f64 d5, d12, d9
+; CHECK-NEXT: vfnms.f64 d4, d12, d8
+; CHECK-NEXT: vfma.f64 d5, d13, d8
+; CHECK-NEXT: vstrw.32 q3, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: b .LBB2_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast double* %a to <8 x double>*
+  %b.ptr = bitcast double* %b to <8 x double>*
+  %c.ptr = bitcast double* %c to <8 x double>*
+  %a.val = load <8 x double>, <8 x double>* %a.ptr
+  %b.val = load <8 x double>, <8 x double>* %b.ptr
+  %strided.vec = shufflevector <8 x double> %a.val, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec46 = shufflevector <8 x double> %a.val, <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec48 = shufflevector <8 x double> %b.val, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec49 = shufflevector <8 x double> %b.val, <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x double> %strided.vec48, %strided.vec
+  %1 = fmul fast <4 x double> %strided.vec49, %strided.vec46
+  %2 = fsub fast <4 x double> %0, %1
+  %3 = fmul fast <4 x double> %strided.vec49, %strided.vec
+  %4 = fmul fast <4 x double> %strided.vec48, %strided.vec46
+  %5 = fadd fast <4 x double> %3, %4
+  %6 = bitcast double* undef to <8 x double>*
+  %interleaved.vec = shufflevector <4 x double> %2, <4 x double> %5, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x double> %interleaved.vec, <8 x double>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v16f64(double* %a, double* %b, double* %c) #0 {
+; CHECK-LABEL: complex_mul_v16f64:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #48
+; CHECK-NEXT: sub sp, #48
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB3_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT: vmul.f64 d0, d5, d3
+; CHECK-NEXT: vmul.f64 d1, d4, d3
+; CHECK-NEXT: vfnms.f64 d0, d4, d2
+; CHECK-NEXT: vfma.f64 d1, d5, d2
+; CHECK-NEXT: vldrw.u32 q4, [r1, #16]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmul.f64 d0, d9, d7
+; CHECK-NEXT: vmul.f64 d1, d8, d7
+; CHECK-NEXT: vfnms.f64 d0, d8, d6
+; CHECK-NEXT: vfma.f64 d1, d9, d6
+; CHECK-NEXT: vldrw.u32 q6, [r1, #32]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q7, [r1, #48]
+; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vmul.f64 d0, d13, d11
+; CHECK-NEXT: vmul.f64 d1, d12, d11
+; CHECK-NEXT: vfnms.f64 d0, d12, d10
+; CHECK-NEXT: vfma.f64 d1, d13, d10
+; CHECK-NEXT: vmul.f64 d6, d15, d9
+; CHECK-NEXT: vmul.f64 d7, d14, d9
+; CHECK-NEXT: vldrw.u32 q5, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q6, [r1, #64]
+; CHECK-NEXT: vfnms.f64 d6, d14, d8
+; CHECK-NEXT: vfma.f64 d7, d15, d8
+; CHECK-NEXT: vldrw.u32 q7, [r0, #96]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
+; CHECK-NEXT: vmul.f64 d8, d13, d11
+; CHECK-NEXT: vmul.f64 d9, d12, d11
+; CHECK-NEXT: vfnms.f64 d8, d12, d10
+; CHECK-NEXT: vfma.f64 d9, d13, d10
+; CHECK-NEXT: vmul.f64 d10, d3, d15
+; CHECK-NEXT: vmul.f64 d11, d2, d15
+; CHECK-NEXT: vldrw.u32 q6, [r0, #112]
+; CHECK-NEXT: vldrw.u32 q2, [r1, #112]
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vfnms.f64 d10, d2, d14
+; CHECK-NEXT: vfma.f64 d11, d3, d14
+; CHECK-NEXT: vldrw.u32 q7, [r0, #80]
+; CHECK-NEXT: vldrw.u32 q0, [r1, #80]
+; CHECK-NEXT: vmul.f64 d2, d5, d13
+; CHECK-NEXT: vmul.f64 d3, d4, d13
+; CHECK-NEXT: vfnms.f64 d2, d4, d12
+; CHECK-NEXT: vfma.f64 d3, d5, d12
+; CHECK-NEXT: vmul.f64 d4, d1, d15
+; CHECK-NEXT: vmul.f64 d5, d0, d15
+; CHECK-NEXT: vfnms.f64 d4, d0, d14
+; CHECK-NEXT: vfma.f64 d5, d1, d14
+; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q5, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: vstrw.32 q4, [r0]
+; CHECK-NEXT: vstrw.32 q3, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: b .LBB3_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast double* %a to <16 x double>*
+  %b.ptr = bitcast double* %b to <16 x double>*
+  %c.ptr = bitcast double* %c to <16 x double>*
+  %a.val = load <16 x double>, <16 x double>* %a.ptr
+  %b.val = load <16 x double>, <16 x double>* %b.ptr
+  %strided.vec = shufflevector <16 x double> %a.val, <16 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.vec46 = shufflevector <16 x double> %a.val, <16 x double> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %strided.vec48 = shufflevector <16 x double> %b.val, <16 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.vec49 = shufflevector <16 x double> %b.val, <16 x double> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fmul fast <8 x double> %strided.vec48, %strided.vec
+  %1 = fmul fast <8 x double> %strided.vec49, %strided.vec46
+  %2 = fsub fast <8 x double> %0, %1
+  %3 = fmul fast <8 x double> %strided.vec49, %strided.vec
+  %4 = fmul fast <8 x double> %strided.vec48, %strided.vec46
+  %5 = fadd fast <8 x double> %3, %4
+  %6 = bitcast double* undef to <16 x double>*
+  %interleaved.vec = shufflevector <8 x double> %2, <8 x double> %5, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x double> %interleaved.vec, <16 x double>* %6, align 4
+  br label %vector.body
+}
diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-add.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-add.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-add.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -o - | FileCheck %s
+
+target triple = "arm-arm-none-eabi"
+attributes #0 = { "target-cpu"="cortex-m55" }
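+
+; NOTE: For reference (assuming the usual vcadd rotation semantics):
+; rotation I computes x + i*y, i.e. re = x.re - y.im and im = x.im + y.re,
+; which maps to vcadd #90; rotation III computes x - i*y, i.e.
+; re = x.re + y.im and im = x.im - y.re, which maps to vcadd #270.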
+
+define void @complex_rotation_I() #0 {
+; CHECK-LABEL: complex_rotation_I:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #16
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB0_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcadd.f32 q2, q0, q0, #90
+; CHECK-NEXT: vcadd.f32 q0, q1, q1, #90
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: b .LBB0_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fsub fast <4 x float> %strided.vec41, %strided.vec39
+  %1 = fadd fast <4 x float> %strided.vec42, %strided.vec
+  %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
+
+define void @complex_rotation_III() #0 {
+; CHECK-LABEL: complex_rotation_III:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #16
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB1_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcadd.f32 q2, q0, q0, #270
+; CHECK-NEXT: vcadd.f32 q0, q1, q1, #270
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: b .LBB1_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fadd fast <4 x float> %strided.vec42, %strided.vec
+  %1 = fsub fast <4 x float> %strided.vec39, %strided.vec41
+  %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-mul.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-mul.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-mul.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -o - | FileCheck %s
+
+target triple = "arm-arm-none-eabi"
+attributes #0 = { "target-cpu"="cortex-m55" }
+
+define void @complex_rotation() #0 {
+; CHECK-LABEL: complex_rotation:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #16
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB0_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcmul.f32 q2, q0, q0, #0
+; CHECK-NEXT: vcmla.f32 q2, q0, q0, #90
+; CHECK-NEXT: vcmul.f32 q0, q1, q1, #0
+; CHECK-NEXT: vcmla.f32 q0, q1, q1, #90
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: b .LBB0_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x float> %strided.vec41, %strided.vec
+  %1 = fmul fast <4 x float> %strided.vec42, %strided.vec39
+  %2 = fsub fast <4 x float> %0, %1
+  %3 = fmul fast <4 x float> %strided.vec42, %strided.vec
+  %4 = fmul fast <4 x float> %strided.vec41, %strided.vec39
+  %5 = fadd fast <4 x float> %3, %4
+  %interleaved.vec = shufflevector <4 x float> %2, <4 x float> %5, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
+
+define void @complex_rotation_I() #0 {
+; CHECK-LABEL: complex_rotation_I:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: movs r1, #16
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB1_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vmov.f32 s0, s4
+; CHECK-NEXT: vmov.f32 s1, s6
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vmov.f32 s3, s10
+; CHECK-NEXT: vmul.f32 q2, q1, q1
+; CHECK-NEXT: vneg.f32 q3, q2
+; CHECK-NEXT: vmul.f32 q2, q0, q1
+; CHECK-NEXT: vneg.f32 q2, q2
+; CHECK-NEXT: vfma.f32 q3, q0, q0
+; CHECK-NEXT: vfms.f32 q2, q0, q1
+; CHECK-NEXT: vst20.32 {q2, q3}, [r0]
+; CHECK-NEXT: vst21.32 {q2, q3}, [r0]
+; CHECK-NEXT: b .LBB1_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x float> %strided.vec41, %strided.vec
+  %1 = fmul fast <4 x float> %strided.vec42, %strided.vec39
+  %2 = fsub fast <4 x float> %0, %1
+  %3 = fneg fast <4 x float> %strided.vec
+  %4 = fmul fast <4 x float> %strided.vec42, %3
+  %5 = fmul fast <4 x float> %strided.vec41, %strided.vec39
+  %6 = fsub fast <4 x float> %4, %5
+  %interleaved.vec = shufflevector <4 x float> %6, <4 x float> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
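+
+; NOTE: Rotation II below computes the negated product -(a*b); assuming the
+; usual vcmul/vcmla rotation semantics, this is expressible as vcmul #180
+; (re = -a.re*b.re, im = -a.re*b.im) accumulated with vcmla #270
+; (re += a.im*b.im, im -= a.im*b.re).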
+
+define void @complex_rotation_II() #0 {
+; CHECK-LABEL: complex_rotation_II:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #16
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB2_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcmul.f32 q2, q0, q0, #180
+; CHECK-NEXT: vcmla.f32 q2, q0, q0, #270
+; CHECK-NEXT: vcmul.f32 q0, q1, q1, #180
+; CHECK-NEXT: vcmla.f32 q0, q1, q1, #270
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: b .LBB2_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec39.neg = fneg fast <4 x float> %strided.vec39
+  %0 = fmul fast <4 x float> %strided.vec41, %strided.vec39.neg
+  %1 = fmul fast <4 x float> %strided.vec42, %strided.vec
+  %2 = fsub fast <4 x float> %0, %1
+  %3 = fmul fast <4 x float> %strided.vec42, %strided.vec39
+  %4 = fmul fast <4 x float> %strided.vec41, %strided.vec
+  %5 = fsub fast <4 x float> %3, %4
+  %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
+
+define void @complex_rotation_III() #0 {
+; CHECK-LABEL: complex_rotation_III:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: movs r1, #16
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB3_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vmov.f32 s0, s4
+; CHECK-NEXT: vmov.f32 s1, s6
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vmov.f32 s3, s10
+; CHECK-NEXT: vmul.f32 q2, q1, q1
+; CHECK-NEXT: vneg.f32 q3, q2
+; CHECK-NEXT: vmul.f32 q2, q0, q1
+; CHECK-NEXT: vfma.f32 q3, q0, q0
+; CHECK-NEXT: vfma.f32 q2, q1, q0
+; CHECK-NEXT: vst20.32 {q2, q3}, [r0]
+; CHECK-NEXT: vst21.32 {q2, q3}, [r0]
+; CHECK-NEXT: b .LBB3_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x float> %strided.vec41, %strided.vec
+  %1 = fmul fast <4 x float> %strided.vec42, %strided.vec39
+  %2 = fsub fast <4 x float> %0, %1
+  %3 = fmul fast <4 x float> %strided.vec42, %strided.vec
+  %4 = fmul fast <4 x float> %strided.vec41, %strided.vec39
+  %5 = fadd fast <4 x float> %3, %4
+  %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
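+
+; NOTE: Rotations I and III in this file swap the real/imaginary lanes of
+; the product before interleaving, which presumably does not correspond to
+; any single vcmul/vcmla rotation pair; hence the generic vmul/vneg/vfma
+; expansion in the checks above.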