diff --git a/llvm/include/llvm/CodeGen/ExpandComplex.h b/llvm/include/llvm/CodeGen/ExpandComplex.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/CodeGen/ExpandComplex.h @@ -0,0 +1,22 @@ +//===---- ExpandComplex.h - Expand experimental complex intrinsics --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_EXPANDCOMPLEX_H +#define LLVM_CODEGEN_EXPANDCOMPLEX_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class ExpandComplexPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} // end namespace llvm + +#endif // LLVM_CODEGEN_EXPANDCOMPLEX_H diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1244,6 +1244,9 @@ VECREDUCE_UMAX, VECREDUCE_UMIN, + /// COMPLEX_MUL - Do a naive complex multiplication. + COMPLEX_MUL, + // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(VPSDID, ...) VPSDID, #include "llvm/IR/VPIntrinsics.def" diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -472,6 +472,10 @@ /// printing assembly. ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions = true); + /// This pass expands the experimental complex intrinsics into regular + /// floating-point arithmetic or calls to __mulsc3 (or similar) functions. + FunctionPass *createExpandComplexPass(); + /// This pass expands the experimental reduction intrinsics into sequences of /// shuffles. FunctionPass *createExpandReductionsPass(); diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -652,6 +652,24 @@ return false; } + /// Enum that specifies how a C complex type is lowered (in LLVM type terms). + enum class ComplexABI { + Memory, ///< Indicates that a pointer to the struct is passed. + Vector, ///< Indicates that T _Complex can be passed as <2 x T>. + Struct, ///< Indicates that T _Complex can be passed as {T, T}. + Integer, ///< Indicates that an integer of the same size is passed. + }; + + /// Returns how a C complex type is lowered when used as the return value. + virtual ComplexABI getComplexReturnABI(Type *ScalarFloatTy) const { + return ComplexABI::Struct; + } + + /// Returns true if the target can match the @llvm.experimental.complex.fmul + /// intrinsic with the given type. Such an intrinsic is assumed will only be + /// matched when "complex-range" is "limited" or "no-nan". 
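+  /// The type passed in is the IR vector type used by the intrinsic (e.g.
+  /// <2 x float> for one float _Complex value); targets that return true are
+  /// expected to custom-lower the resulting ISD::COMPLEX_MUL node for that
+  /// type.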
+ virtual bool CustomLowerComplexMultiply(Type *FloatTy) const { return false; } + /// Return if the target supports combining a /// chain like: /// \code @@ -2515,6 +2533,7 @@ case ISD::FMAXNUM_IEEE: case ISD::FMINIMUM: case ISD::FMAXIMUM: + case ISD::COMPLEX_MUL: return true; default: return false; } diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -154,6 +154,7 @@ void initializeEHContGuardCatchretPass(PassRegistry &); void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&); void initializeEntryExitInstrumenterPass(PassRegistry&); +void initializeExpandComplexPass(PassRegistry &); void initializeExpandMemCmpPassPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); void initializeExpandReductionsPass(PassRegistry&); diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -703,6 +703,7 @@ def assertzext : SDNode<"ISD::AssertZext", SDT_assert>; def assertalign : SDNode<"ISD::AssertAlign", SDT_assert>; +def COMPLEX_MUL : SDNode<"ISD::COMPLEX_MUL", SDTFPBinOp, [SDNPCommutative]>; //===----------------------------------------------------------------------===// // Selection DAG Condition Codes diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -53,6 +53,7 @@ EdgeBundles.cpp EHContGuardCatchret.cpp ExecutionDomainFix.cpp + ExpandComplex.cpp ExpandMemCmp.cpp ExpandPostRAPseudos.cpp ExpandReductions.cpp diff --git a/llvm/lib/CodeGen/ExpandComplex.cpp b/llvm/lib/CodeGen/ExpandComplex.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/CodeGen/ExpandComplex.cpp @@ -0,0 +1,293 @@ +//===-- ExpandComplex.cpp - Expand experimental complex intrinsics --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements IR expansion for complex intrinsics, allowing targets +// to enable the intrinsics until just before codegen. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ExpandComplex.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +namespace { + +bool expandComplexInstruction(IntrinsicInst *CI, const TargetLowering *TLI, + const DataLayout &DL) { + Intrinsic::ID Opcode = CI->getIntrinsicID(); + assert((Opcode == Intrinsic::experimental_complex_fmul || + Opcode == Intrinsic::experimental_complex_fdiv) && + "Expected a complex instruction"); + + // Break the input values up into real and imaginary pieces. 
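+  // For reference, the lowerings below implement the usual identities
+  //   (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
+  //   (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c*c + d*d)
+  // with a = LhsR, b = LhsI, c = RhsR, d = RhsI.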
+ Type *ComplexVectorTy = CI->getArgOperand(0)->getType(); + Type *FloatTy = ComplexVectorTy->getScalarType(); + IRBuilder<> Builder(CI); + Builder.setFastMathFlags(CI->getFastMathFlags()); + Value *LhsR = Builder.CreateExtractElement(CI->getArgOperand(0), uint64_t(0)); + Value *LhsI = Builder.CreateExtractElement(CI->getArgOperand(0), uint64_t(1)); + Value *RhsR = nullptr, *RhsI = nullptr; + RhsR = Builder.CreateExtractElement(CI->getArgOperand(1), uint64_t(0)); + RhsI = Builder.CreateExtractElement(CI->getArgOperand(1), uint64_t(1)); + + // The expansion has three pieces: the naive arithmetic, a possible prescaling + // (not relevant for multiplication), and a step to convert NaN output values + // to infinity values in certain situations (see Annex G of the C + // specification for more details). The "complex-range" attribute determines + // how many we need: "limited" has just the first one, "no-nan" the first two, + // and "full" for all three. + + // Get the "complex-range" attribute, setting a default based on the presence + // of fast-math flags. + StringRef Range = CI->getFnAttr("complex-range").getValueAsString(); + if (Range.empty()) { + Range = CI->getFastMathFlags().noNaNs() || CI->getFastMathFlags().noInfs() + ? "no-nan" + : "full"; + } + + // We can expand to naive arithmetic code if we only need the first piece. For + // multiplication, we can also accept "no-nan", since there is no semantic + // difference between "limited" and "no-nan" in that case. + bool CanExpand = + Range == "limited" || + (Range == "no-nan" && Opcode == Intrinsic::experimental_complex_fmul); + + Value *OutReal, *OutImag; + if (!CanExpand) { + // Do a call directly to the compiler-rt library here. + const char *Name = nullptr; + if (Opcode == Intrinsic::experimental_complex_fmul) { + if (FloatTy->isHalfTy()) + Name = "__mulhc3"; + else if (FloatTy->isFloatTy()) + Name = "__mulsc3"; + else if (FloatTy->isDoubleTy()) + Name = "__muldc3"; + else if (FloatTy->isX86_FP80Ty()) + Name = "__mulxc3"; + else if (FloatTy->isFP128Ty() || FloatTy->isPPC_FP128Ty()) + Name = "__multc3"; + } else if (Opcode == Intrinsic::experimental_complex_fdiv) { + if (FloatTy->isHalfTy()) + Name = "__divhc3"; + else if (FloatTy->isFloatTy()) + Name = "__divsc3"; + else if (FloatTy->isDoubleTy()) + Name = "__divdc3"; + else if (FloatTy->isX86_FP80Ty()) + Name = "__divxc3"; + else if (FloatTy->isFP128Ty() || FloatTy->isPPC_FP128Ty()) + Name = "__divtc3"; + } + + if (!Name) + report_fatal_error("Cannot find libcall for intrinsic"); + + // The function we are to call is T complex __name(T, T, T, T) in C terms. + // Use TLI to figure out what the appropriate actual ABI for this function. + StructType *ComplexStructTy = StructType::get(FloatTy, FloatTy); + switch (TLI->getComplexReturnABI(FloatTy)) { + case TargetLowering::ComplexABI::Vector: { + // When the result is a vector type directly, we can replace the intrinsic + // with the call to the underlying function without any other munging. + FunctionCallee Func = CI->getModule()->getOrInsertFunction( + Name, ComplexVectorTy, FloatTy, FloatTy, FloatTy, FloatTy); + Value *NewResult = Builder.CreateCall(Func, {LhsR, LhsI, RhsR, RhsI}); + CI->replaceAllUsesWith(NewResult); + CI->eraseFromParent(); + return true; + } + case TargetLowering::ComplexABI::Integer: { + // This ABI form packs the type as a small struct in an integer register. + // All we need to do is move the integer to a vector register, without any + // other munging. 
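+      // For example, a float _Complex result on Windows comes back as an
+      // i64, which the bitcast below converts back to <2 x float>.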
+ uint64_t Width = ComplexVectorTy->getPrimitiveSizeInBits().getFixedSize(); + Type *IntegerTy = Builder.getIntNTy(Width); + FunctionCallee Func = CI->getModule()->getOrInsertFunction( + Name, IntegerTy, FloatTy, FloatTy, FloatTy, FloatTy); + Value *NewResult = Builder.CreateBitCast( + Builder.CreateCall(Func, {LhsR, LhsI, RhsR, RhsI}), ComplexVectorTy); + CI->replaceAllUsesWith(NewResult); + CI->eraseFromParent(); + return true; + } + case TargetLowering::ComplexABI::Memory: { + // Allocate a struct for the return type in the entry block. Stack slot + // coloring should remove duplicate allocations. + unsigned AllocaAS = DL.getAllocaAddrSpace(); + Value *Alloca; + { + IRBuilderBase::InsertPointGuard Guard(Builder); + BasicBlock *EntryBB = &CI->getParent()->getParent()->getEntryBlock(); + Builder.SetInsertPoint(EntryBB, EntryBB->begin()); + Alloca = Builder.CreateAlloca(ComplexStructTy, AllocaAS); + } + + AttributeList Attrs; + Attrs = Attrs.addParamAttribute( + CI->getContext(), 0, + Attribute::getWithStructRetType(CI->getContext(), ComplexStructTy)); + FunctionCallee Func = CI->getModule()->getOrInsertFunction( + Name, std::move(Attrs), Type::getVoidTy(CI->getContext()), + PointerType::get(ComplexStructTy, AllocaAS), FloatTy, FloatTy, + FloatTy, FloatTy); + + Builder.CreateCall(Func, {Alloca, LhsR, LhsI, RhsR, RhsI}); + OutReal = Builder.CreateLoad( + FloatTy, Builder.CreateStructGEP(ComplexStructTy, Alloca, 0)); + OutImag = Builder.CreateLoad( + FloatTy, Builder.CreateStructGEP(ComplexStructTy, Alloca, 1)); + break; + } + case TargetLowering::ComplexABI::Struct: { + FunctionCallee Func = CI->getModule()->getOrInsertFunction( + Name, ComplexStructTy, FloatTy, FloatTy, FloatTy, FloatTy); + Value *ComplexStructRes = + Builder.CreateCall(Func, {LhsR, LhsI, RhsR, RhsI}); + OutReal = Builder.CreateExtractValue(ComplexStructRes, 0); + OutImag = Builder.CreateExtractValue(ComplexStructRes, 1); + break; + } + } + } else { + switch (Opcode) { + case Intrinsic::experimental_complex_fmul: { + // If the target has a complex_fmul expansion and the fast-math flag + // set, use that instead of expanding. + if (TLI->CustomLowerComplexMultiply(ComplexVectorTy)) { + return false; + } + + OutReal = Builder.CreateFSub(Builder.CreateFMul(LhsR, RhsR), + Builder.CreateFMul(LhsI, RhsI)); + OutImag = Builder.CreateFAdd(Builder.CreateFMul(LhsI, RhsR), + Builder.CreateFMul(LhsR, RhsI)); + break; + } + case Intrinsic::experimental_complex_fdiv: { + Value *Scale = Builder.CreateFAdd(Builder.CreateFMul(RhsR, RhsR), + Builder.CreateFMul(RhsI, RhsI)); + OutReal = + Builder.CreateFDiv(Builder.CreateFAdd(Builder.CreateFMul(LhsR, RhsR), + Builder.CreateFMul(LhsI, RhsI)), + Scale); + OutImag = + Builder.CreateFDiv(Builder.CreateFSub(Builder.CreateFMul(LhsI, RhsR), + Builder.CreateFMul(LhsR, RhsI)), + Scale); + break; + } + } + } + + // Replace all of the uses of the intrinsic with OutReal/OutImag. We avoid + // creating the vector unless we have to. + bool HasVectorUse = false; + for (User *U : CI->users()) { + uint64_t Index; + if (match(U, m_ExtractElt(m_Value(), m_ConstantInt(Index)))) { + assert((Index == 0 || Index == 1) && "Extract element too small"); + U->replaceAllUsesWith(Index == 0 ? 
OutReal : OutImag); + } else { + HasVectorUse = true; + } + } + + if (HasVectorUse) { + Value *OutComplex = Builder.CreateInsertElement( + Builder.CreateInsertElement(UndefValue::get(ComplexVectorTy), OutReal, + uint64_t(0)), + OutImag, uint64_t(1)); + CI->replaceAllUsesWith(OutComplex); + } else { + CI->replaceAllUsesWith(UndefValue::get(CI->getType())); + } + + CI->eraseFromParent(); + return true; +} + +bool expandComplexIntrinsics(Function &F, const TargetLowering *TLI) { + bool Changed = false; + SmallVector Worklist; + for (auto &I : instructions(F)) { + if (auto *II = dyn_cast(&I)) { + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::experimental_complex_fmul: + case Intrinsic::experimental_complex_fdiv: + Worklist.push_back(II); + break; + } + } + } + + const DataLayout &DL = F.getParent()->getDataLayout(); + for (auto *II : Worklist) { + Changed |= expandComplexInstruction(II, TLI, DL); + } + return Changed; +} + +class ExpandComplex : public FunctionPass { +public: + static char ID; + ExpandComplex() : FunctionPass(ID) { + initializeExpandComplexPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + const TargetMachine *TM = + &getAnalysis().getTM(); + const TargetSubtargetInfo *SubtargetInfo = TM->getSubtargetImpl(F); + const TargetLowering *TLI = SubtargetInfo->getTargetLowering(); + return expandComplexIntrinsics(F, TLI); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + } +}; +} // namespace + +char ExpandComplex::ID; +INITIALIZE_PASS_BEGIN(ExpandComplex, "expand-complex", + "Expand complex intrinsics", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(ExpandComplex, "expand-complex", + "Expand complex intrinsics", false, false) + +FunctionPass *llvm::createExpandComplexPass() { return new ExpandComplex(); } + +PreservedAnalyses ExpandComplexPass::run(Function &F, + FunctionAnalysisManager &AM) { + /*const auto &TTI = AM.getResult(F); + if (!expandReductions(F, &TTI)) + return PreservedAnalyses::all();*/ + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1058,6 +1058,7 @@ case ISD::USHLSAT: case ISD::ROTL: case ISD::ROTR: + case ISD::COMPLEX_MUL: SplitVecRes_BinOp(N, Lo, Hi); break; case ISD::FMA: @@ -3287,6 +3288,7 @@ case ISD::USHLSAT: case ISD::ROTL: case ISD::ROTR: + case ISD::COMPLEX_MUL: // Vector-predicated binary op widening. 
Note that -- unlike the // unpredicated versions -- we don't have to worry about trapping on // operations like UDIV, FADD, etc., as we pass on the original vector diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7186,6 +7186,12 @@ case Intrinsic::experimental_vector_splice: visitVectorSplice(I); return; + case Intrinsic::experimental_complex_fmul: + EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + setValue(&I, DAG.getNode(ISD::COMPLEX_MUL, sdl, ResultVT, + getValue(I.getOperand(0)), + getValue(I.getOperand(1)), Flags)); + return; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -480,6 +480,7 @@ case ISD::VECREDUCE_UMIN: return "vecreduce_umin"; case ISD::VECREDUCE_FMAX: return "vecreduce_fmax"; case ISD::VECREDUCE_FMIN: return "vecreduce_fmin"; + case ISD::COMPLEX_MUL: return "complex_mul"; // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \ diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -922,6 +922,10 @@ // Allow disabling it for testing purposes. if (!DisableExpandReductions) addPass(createExpandReductionsPass()); + + // If the target doesn't support complex intrinsics, or if they need to be + // expanded into more complex calls, generate the expansion to complex calls. + addPass(createExpandComplexPass()); } /// Turn exception handling constructs into something the code generators can diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -979,6 +979,8 @@ /// legal as the hook is used before type legalization. bool isSafeMemOpType(MVT VT) const override; + ComplexABI getComplexReturnABI(Type *ScalarFloatTy) const override; + /// Returns true if the target allows unaligned memory accesses of the /// specified type. Returns whether it is "fast" in the last argument. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment, @@ -1466,6 +1468,8 @@ unsigned getMaxSupportedInterleaveFactor() const override { return 4; } + bool CustomLowerComplexMultiply(Type *FloatTy) const override; + /// Lower interleaved load(s) into target specific /// instructions/intrinsics. 
bool lowerInterleavedLoad(LoadInst *LI, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2118,6 +2118,22 @@ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); } + if (Subtarget.hasFP16()) { + for (auto VT : {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16}) { + if (Subtarget.hasVLX()) + setOperationAction(ISD::COMPLEX_MUL, VT, Custom); + setOperationAction(ISD::COMPLEX_MUL, MVT::v32f16, Custom); + } + } + if (Subtarget.hasAnyFMA() || (Subtarget.hasAVX512() && Subtarget.hasVLX())) { + for (auto VT : {MVT::v2f32, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) + setOperationAction(ISD::COMPLEX_MUL, VT, Custom); + } + if (Subtarget.hasAVX512()) { + setOperationAction(ISD::COMPLEX_MUL, MVT::v8f64, Custom); + setOperationAction(ISD::COMPLEX_MUL, MVT::v16f32, Custom); + } + if (Subtarget.hasAMXTILE()) { addRegisterClass(MVT::x86amx, &X86::TILERegClass); } @@ -2566,6 +2582,44 @@ return MVT::i32; } +TargetLoweringBase::ComplexABI +X86TargetLowering::getComplexReturnABI(Type *ScalarFloatTy) const { + // Windows ABIs don't have dedicated _Complex rules, so they work as regular + // structs. These return as integers if the size is 8 bytes or fewer, or + // structs via memory if larger. (The size threshold is the same for both + // 32 and 64-bit ABIs). + if (Subtarget.isOSWindows()) { + unsigned FloatSize = ScalarFloatTy->getPrimitiveSizeInBits().getFixedSize(); + if (FloatSize <= 32) { + return ComplexABI::Integer; + } else { + return ComplexABI::Memory; + } + } + if (Subtarget.is32Bit()) { + if (ScalarFloatTy->isFloatTy()) { + return ComplexABI::Integer; + } else if (ScalarFloatTy->isHalfTy()) { + return ComplexABI::Vector; + } else { + return ComplexABI::Memory; + } + } else { + // The x86-64 ABI specifies that (save for x86-fp80), this is handled as a + // regular C struct. This means that float and smaller get packed into a + // single vector in xmm0; double and x86-fp80 (by special case) return two + // values; and larger types than x86-fp80 (i.e., fp128) returns via memory. + unsigned FloatSize = ScalarFloatTy->getPrimitiveSizeInBits().getFixedSize(); + if (FloatSize <= 32) { + return ComplexABI::Vector; + } else if (FloatSize <= 80) { + return ComplexABI::Struct; + } else { + return ComplexABI::Memory; + } + } +} + bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) return X86ScalarSSEf32; @@ -31578,6 +31632,68 @@ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); } +bool X86TargetLowering::CustomLowerComplexMultiply(Type *FloatTy) const { + auto VecTy = cast(FloatTy); + unsigned VecSize = VecTy->getNumElements() * VecTy->getScalarSizeInBits(); + Type *ElementTy = VecTy->getElementType(); + if (ElementTy->isHalfTy()) { + // All the half type need avx512fp16 enabled. + if (VecSize == 512) + // For 512-bt vector type, just avx512fp16 needed. + return Subtarget.hasFP16(); + else + // 128-bit, 256-bit vector type are legal and other vector type can + // be widened or split. AVX512VL should be enabled. + return Subtarget.hasFP16() && Subtarget.hasVLX(); + } + if (ElementTy->isFloatTy() || ElementTy->isDoubleTy()) { + if (VecSize == 512) + // For 512-bt vector type, they are legal or can be split. + return Subtarget.hasAVX512() || Subtarget.hasAnyFMA(); + // 128-bit, 256-bit vector type are legal or and other type can + // be widened or split. 
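+    // (The f32/f64 lowering is a shuffle + FMADDSUB sequence, which is why
+    // FMA or AVX512 + AVX512VL support is required here.)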
+ return Subtarget.hasAnyFMA() || + (Subtarget.hasAVX512() && Subtarget.hasVLX()); + } + return false; +} + +static SDValue LowerComplexMUL(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + MVT VT = Op.getSimpleValueType(); + MVT ElementTy = VT.getScalarType(); + SDLoc DL(Op); + // Custom handling for half type since we have corresponding complex half + // multiply instructions. + // FIXME: We use vfmulcph for sclar complex multiply here, use vfmulcsh + // instead. + if (ElementTy == MVT::f16) { + // Transform llvm.experimental.complex.fmul.vxf16 to vfmulcph instruction. + MVT BitCastTy = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2); + SDValue LHS = DAG.getNode(ISD::BITCAST, DL, BitCastTy, Op.getOperand(0)); + SDValue RHS = DAG.getNode(ISD::BITCAST, DL, BitCastTy, Op.getOperand(1)); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::VFMULC, DL, BitCastTy, LHS, RHS)); + } + assert((ElementTy == MVT::f32 || ElementTy == MVT::f64) && + "Unexpected element type"); + // llvm.experimental.complex.fmul.vxf{32,64} are transformed to SHUFFLE and + // FMA instructions. + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + unsigned Imm = ElementTy == MVT::SimpleValueType::f32 ? 0xb1 : 0x55; + SDValue V1, V2, V3, V4; + // Swap vcetor elements in pairs. E.g: [1,2,3,4] ---> [2,1,4,3] + V1 = DAG.getNode(X86ISD::VPERMILPI, DL, VT, LHS, + DAG.getTargetConstant(Imm, DL, MVT::i8)); + // Duplicate the odd index elements, which is real part. + V2 = DAG.getNode(X86ISD::MOVSHDUP, DL, VT, RHS); + V3 = DAG.getNode(ISD::FMUL, DL, VT, V1, V2); + // Duplicate the evem index elements, which is imaginary part. + V4 = DAG.getNode(X86ISD::MOVSLDUP, DL, VT, RHS); + return DAG.getNode(X86ISD::FMADDSUB, DL, VT, LHS, V4, V3); +} + /// Provide custom lowering hooks for some operations. SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -31721,6 +31837,7 @@ case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG); case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG); + case ISD::COMPLEX_MUL: return LowerComplexMUL(Op, DAG, Subtarget); } } @@ -32723,6 +32840,22 @@ // to move the scalar in two i32 pieces. Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG)); return; + case ISD::COMPLEX_MUL: + // Widen the vector size smaller than 128 to 128 + MVT VT = N->getSimpleValueType(0); + // FIXME: (COMPLEX_MUL v2f16, v2f16) should be lowered to VFMULCSH but we + // mix the v2f16 and v4f16 here. + assert(VT == MVT::v2f32 || VT == MVT::v2f16 || + VT == MVT::v4f16 && "Unexpected Value type of COMPLEX_MUL!"); + MVT WideVT = + VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32; + SmallVector Ops(VT == MVT::v2f16 ? 
4 : 2, DAG.getUNDEF(VT)); + Ops[0] = N->getOperand(0); + SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops); + Ops[0] = N->getOperand(1); + SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops); + Results.push_back(DAG.getNode(N->getOpcode(), dl, WideVT, LHS, RHS)); + return; } } diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -29,6 +29,7 @@ ; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics +; CHECK-NEXT: Expand complex intrinsics ; CHECK-NEXT: Expand indirectbr instructions ; CHECK-NEXT: Exception handling preparation ; CHECK-NEXT: Safe Stack instrumentation pass diff --git a/llvm/test/CodeGen/X86/complex-32bit.ll b/llvm/test/CodeGen/X86/complex-32bit.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-32bit.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s + +; Check that we handle the ABI of the complex functions correctly for 32-bit. + +declare <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half>, <2 x half>) +declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>) +declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) +declare <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80>, <2 x x86_fp80>) +declare <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128>, <2 x fp128>) + +define <2 x half> @intrinsic_f16(<2 x half> %z, <2 x half> %w) { +; CHECK-LABEL: intrinsic_f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %esi, -8 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl %edx +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: calll __mulhc3@PLT +; CHECK-NEXT: addl $24, %esp +; CHECK-NEXT: .cfi_adjust_cfa_offset -24 +; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w) + ret <2 x half> %mul +} + +define <2 x float> @intrinsic_f32(<2 x float> %z, <2 x float> %w) { +; CHECK-LABEL: intrinsic_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $28, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps (%esp) +; CHECK-NEXT: calll __mulsc3@PLT +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: addl $28, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x 
float> %mul +} + + +define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) { +; CHECK-LABEL: intrinsic_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $60, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: calll __muldc3@PLT +; CHECK-NEXT: subl $4, %esp +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fxch %st(1) +; CHECK-NEXT: addl $60, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} + +define <2 x x86_fp80> @intrinsic_f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) { +; CHECK-LABEL: intrinsic_f80: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $92, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%esp) +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: calll __mulxc3@PLT +; CHECK-NEXT: subl $4, %esp +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fxch %st(1) +; CHECK-NEXT: addl $92, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %mul = call <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) + ret <2 x x86_fp80> %mul +} + +define <2 x fp128> @intrinsic_f128(<2 x fp128> %z, <2 x fp128> %w) { +; CHECK-LABEL: intrinsic_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 20 +; CHECK-NEXT: subl $60, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: .cfi_offset %esi, -20 +; CHECK-NEXT: .cfi_offset %edi, -16 +; CHECK-NEXT: .cfi_offset %ebx, -12 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: subl $12, %esp +; CHECK-NEXT: .cfi_adjust_cfa_offset 12 +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl 
{{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: calll __multc3@PLT +; CHECK-NEXT: .cfi_adjust_cfa_offset -4 +; CHECK-NEXT: addl $76, %esp +; CHECK-NEXT: .cfi_adjust_cfa_offset -76 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, 28(%esi) +; CHECK-NEXT: movl %eax, 24(%esi) +; CHECK-NEXT: movl %ebp, 20(%esi) +; CHECK-NEXT: movl %ebx, 16(%esi) +; CHECK-NEXT: movl %edi, 12(%esi) +; CHECK-NEXT: movl %edx, 8(%esi) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 4(%esi) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, (%esi) +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: addl $60, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 20 +; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popl %edi +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl $4 + %mul = call <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128> %z, <2 x fp128> %w) + ret <2 x fp128> %mul +} + diff --git a/llvm/test/CodeGen/X86/complex-64bit.ll b/llvm/test/CodeGen/X86/complex-64bit.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-64bit.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Check that we handle the ABI of the complex functions correctly for 32-bit. 
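+; (The RUN line above uses an x86_64 triple; the 32-bit ABI is covered in
+; complex-32bit.ll.)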
+ +declare <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half>, <2 x half>) +declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>) +declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) +declare <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80>, <2 x x86_fp80>) +declare <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128>, <2 x fp128>) + +define <2 x half> @intrinsic_f16(<2 x half> %z, <2 x half> %w) { +; CHECK-LABEL: intrinsic_f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w) + ret <2 x half> %mul +} + +define <2 x float> @intrinsic_f32(<2 x float> %z, <2 x float> %w) { +; CHECK-LABEL: intrinsic_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movaps %xmm1, %xmm2 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] +; CHECK-NEXT: callq __mulsc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) { +; CHECK-LABEL: intrinsic_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movaps %xmm1, %xmm2 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-NEXT: callq __muldc3@PLT +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} + +define <2 x x86_fp80> @intrinsic_f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) { +; CHECK-LABEL: intrinsic_f80: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt (%rsp) +; CHECK-NEXT: callq __mulxc3@PLT +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) + ret <2 x x86_fp80> %mul +} + +define <2 x fp128> @intrinsic_f128(<2 x fp128> %z, <2 x fp128> %w) { +; CHECK-LABEL: intrinsic_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movq %rsp, %rdi +; CHECK-NEXT: callq __multc3@PLT +; CHECK-NEXT: movaps (%rsp), %xmm0 +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128> %z, <2 x fp128> %w) + ret <2 x fp128> %mul +} + diff --git a/llvm/test/CodeGen/X86/complex-divide.ll 
b/llvm/test/CodeGen/X86/complex-divide.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-divide.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Check the expansion of the complex divide intrinsic. This only tests +; expansion for 32-bit floats, as the expansion should produce identical IR +; expansions save for the ABI of calling __divsc3, which is tested (indirectly) +; for each type individually in complex-{32,64}bit.ll. + +declare <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float>, <2 x float>) + +; Generate a call to __divsc3 +define <2 x float> @intrinsic_slow_f32(<2 x float> %z, <2 x float> %w) { +; CHECK-LABEL: intrinsic_slow_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movaps %xmm1, %xmm2 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] +; CHECK-NEXT: callq __divsc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %div = call <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %div +} + +; Do not do an expansion (because fast is not sufficient to imply full +; complex-range=limited. +define <2 x float> @intrinsic_implied_not_limited_f32(<2 x float> %z, <2 x float> %w) #1 { +; CHECK-LABEL: intrinsic_implied_not_limited_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovaps %xmm1, %xmm2 +; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-NEXT: callq __divsc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %div = call fast <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %div +} + +; Do an expansion (because of complex-range=limited) +define <2 x float> @intrinsic_limited_f32(<2 x float> %z, <2 x float> %w) #1 { +; CHECK-LABEL: intrinsic_limited_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-COUNT-2: vmulss +; CHECK-NEXT: vaddss {{.*}} %xmm4 +; CHECK-COUNT-2: vmulss +; CHECK-NEXT: vaddss {{.*}} %xmm5 +; CHECK-NEXT: vdivss %xmm4, %xmm5, %xmm5 +; CHECK-COUNT-2: vmulss +; CHECK-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vdivss %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[2,3] +; CHECK-NEXT: retq + %div = call <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #0 + ret <2 x float> %div +} + +; Do an expansion, and use the FMA (because of fast-math flags). 
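+; The call below carries both the fast flag and "complex-range"="limited"
+; (attribute #0): the attribute permits the expansion, and the fast flag lets
+; the resulting mul/add/sub sequence fold into FMA.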
+define <2 x float> @intrinsic_fast_f32(<2 x float> %z, <2 x float> %w) #1 { +; CHECK-LABEL: intrinsic_fast_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-NEXT: vmulss %xmm3, %xmm3, %xmm4 +; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm1 * xmm1) + xmm4 +; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm5 +; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm5 = (xmm0 * xmm1) + xmm5 +; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vdivss %xmm4, %xmm6, %xmm4 +; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm5 +; CHECK-NEXT: vmulss %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm2 * xmm1) - xmm0 +; CHECK-NEXT: vmulss %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[2,3] +; CHECK-NEXT: retq + %div = call fast <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #0 + ret <2 x float> %div +} + +attributes #0 = { "complex-range"="limited" } +attributes #1 = { "target-features"="+fma" } +attributes #2 = { "complex-range"="no-nan" } diff --git a/llvm/test/CodeGen/X86/complex-multiply.ll b/llvm/test/CodeGen/X86/complex-multiply.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-multiply.ll @@ -0,0 +1,525 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma | FileCheck %s --check-prefixes=ALL,FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512VL + + +; Check the expansion of the complex multiply intrinsic. This only tests +; expansion for 32-bit floats, as the expansion should produce identical IR +; expansions save for ABI of calling __mulsc3, which is tested for each type +; individually in complex-{32,64}bit.ll. + +declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.experimental.complex.fmul.v4f32(<4 x float>, <4 x float>) +declare <8 x float> @llvm.experimental.complex.fmul.v8f32(<8 x float>, <8 x float>) +declare <16 x float> @llvm.experimental.complex.fmul.v16f32(<16 x float>, <16 x float>) +declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) +declare <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double>, <4 x double>) +declare <8 x double> @llvm.experimental.complex.fmul.v8f64(<8 x double>, <8 x double>) +declare <6 x float> @llvm.experimental.complex.fmul.v6f32(<6 x float>, <6 x float>) +declare <6 x double> @llvm.experimental.complex.fmul.v6f64(<6 x double>, <6 x double>) +declare <32 x float> @llvm.experimental.complex.fmul.v32f32(<32 x float>, <32 x float>) + +; Generate a call to __mulsc3 +define <2 x float> @intrinsic_slow_v2f32(<2 x float> %z, <2 x float> %w) { +; ALL-LABEL: intrinsic_slow_v2f32: +; ALL: # %bb.0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: .cfi_def_cfa_offset 16 +; ALL-NEXT: vmovaps %xmm1, %xmm2 +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; ALL-NEXT: callq __mulsc3@PLT +; ALL-NEXT: popq %rax +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +; Do an expansion (because of fast-math flags). 
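+; Without a "complex-range" attribute, the nnan/ninf flags on the call imply
+; the "no-nan" range, which is sufficient for multiplication.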
+define <2 x float> @intrinsic_implied_limited_v2f32(<2 x float> %z, <2 x float> %w) { +; ALL-LABEL: intrinsic_implied_limited_v2f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,0,3,2] +; ALL-NEXT: vmulps %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call nnan ninf <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +; Do an expansion (because of complex-range=limited). +define <2 x float> @intrinsic_limited_v2f32(<2 x float> %z, <2 x float> %w) { +; ALL-LABEL: intrinsic_limited_v2f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,0,3,2] +; ALL-NEXT: vmulps %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) #0 + ret <2 x float> %mul +} + +; Do an expansion, and use the FMA (because of fast-math flags). +define <2 x float> @intrinsic_fast_v2f32(<2 x float> %z, <2 x float> %w) { +; ALL-LABEL: intrinsic_fast_v2f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,0,3,2] +; ALL-NEXT: vmulps %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call fast <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +define <4 x float> @intrinsic_slow_v4f32(<4 x float> %z, <4 x float> %w) { +; ALL-LABEL: intrinsic_slow_v4f32: +; ALL: # %bb.0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: .cfi_def_cfa_offset 16 +; ALL-NEXT: vmovaps %xmm1, %xmm2 +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; ALL-NEXT: callq __mulsc3@PLT +; ALL-NEXT: popq %rax +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq + %mul = call <4 x float> @llvm.experimental.complex.fmul.v4f32(<4 x float> %z, <4 x float> %w) + ret <4 x float> %mul +} + +define <4 x float> @intrinsic_fast_v4f32(<4 x float> %z, <4 x float> %w) { +; ALL-LABEL: intrinsic_fast_v4f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,0,3,2] +; ALL-NEXT: vmulps %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call fast <4 x float> @llvm.experimental.complex.fmul.v4f32(<4 x float> %z, <4 x float> %w) + ret <4 x float> %mul +} + +define <4 x float> @intrinsic_limited_v4f32(<4 x float> %z, <4 x float> %w) { +; ALL-LABEL: intrinsic_limited_v4f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,0,3,2] +; ALL-NEXT: vmulps %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call <4 x float> @llvm.experimental.complex.fmul.v4f32(<4 x float> %z, <4 x float> %w) #0 + ret <4 x float> %mul +} + +define <8 x float> @intrinsic_slow_v8f32(<8 x float> %z, <8 x float> %w) { +; ALL-LABEL: intrinsic_slow_v8f32: +; ALL: # 
%bb.0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: .cfi_def_cfa_offset 16 +; ALL-NEXT: vmovaps %ymm1, %ymm2 +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; ALL-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; ALL-NEXT: callq __mulsc3@PLT +; ALL-NEXT: popq %rax +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq + %mul = call <8 x float> @llvm.experimental.complex.fmul.v8f32(<8 x float> %z, <8 x float> %w) + ret <8 x float> %mul +} + +define <8 x float> @intrinsic_fast_v8f32(<8 x float> %z, <8 x float> %w) { +; ALL-LABEL: intrinsic_fast_v8f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; ALL-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: vmulps %ymm2, %ymm3, %ymm2 +; ALL-NEXT: vmovsldup {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 +; ALL-NEXT: retq + %mul = call fast <8 x float> @llvm.experimental.complex.fmul.v8f32(<8 x float> %z, <8 x float> %w) + ret <8 x float> %mul +} + +define <8 x float> @intrinsic_limited_v8f32(<8 x float> %z, <8 x float> %w) { +; ALL-LABEL: intrinsic_limited_v8f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; ALL-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: vmulps %ymm2, %ymm3, %ymm2 +; ALL-NEXT: vmovsldup {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 +; ALL-NEXT: retq + %mul = call <8 x float> @llvm.experimental.complex.fmul.v8f32(<8 x float> %z, <8 x float> %w) #0 + ret <8 x float> %mul +} + +define <16 x float> @intrinsic_slow_v16f32(<16 x float> %z, <16 x float> %w) { +; FMA-LABEL: intrinsic_slow_v16f32: +; FMA: # %bb.0: +; FMA-NEXT: pushq %rax +; FMA-NEXT: .cfi_def_cfa_offset 16 +; FMA-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; FMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; FMA-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; FMA-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; FMA-NEXT: callq __mulsc3@PLT +; FMA-NEXT: popq %rax +; FMA-NEXT: .cfi_def_cfa_offset 8 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_slow_v16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %rax +; AVX512VL-NEXT: .cfi_def_cfa_offset 16 +; AVX512VL-NEXT: vmovaps %zmm1, %zmm2 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; AVX512VL-NEXT: callq __mulsc3@PLT +; AVX512VL-NEXT: popq %rax +; AVX512VL-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-NEXT: retq + %mul = call <16 x float> @llvm.experimental.complex.fmul.v16f32(<16 x float> %z, <16 x float> %w) + ret <16 x float> %mul +} + + +define <16 x float> @intrinsic_fast_v16f32(<16 x float> %z, <16 x float> %w) { +; FMA-LABEL: intrinsic_fast_v16f32: +; FMA: # %bb.0: +; FMA-NEXT: vmovshdup {{.*#+}} ymm4 = ymm2[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm5 = ymm0[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4 +; FMA-NEXT: vmovshdup {{.*#+}} ymm2 = ymm3[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm2, %ymm4, %ymm2 +; FMA-NEXT: vmovsldup {{.*#+}} ymm3 = ymm3[0,0,2,2,4,4,6,6] +; FMA-NEXT: 
vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm2 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_fast_v16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm2 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermilps {{.*#+}} zmm3 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512VL-NEXT: vmulps %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vmovsldup {{.*#+}} zmm1 = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; AVX512VL-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call fast <16 x float> @llvm.experimental.complex.fmul.v16f32(<16 x float> %z, <16 x float> %w) + ret <16 x float> %mul +} + +define <16 x float> @intrinsic_limited_v16f32(<16 x float> %z, <16 x float> %w) { +; FMA-LABEL: intrinsic_limited_v16f32: +; FMA: # %bb.0: +; FMA-NEXT: vmovshdup {{.*#+}} ymm4 = ymm2[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm5 = ymm0[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4 +; FMA-NEXT: vmovshdup {{.*#+}} ymm2 = ymm3[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm2, %ymm4, %ymm2 +; FMA-NEXT: vmovsldup {{.*#+}} ymm3 = ymm3[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm2 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_limited_v16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm2 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermilps {{.*#+}} zmm3 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512VL-NEXT: vmulps %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vmovsldup {{.*#+}} zmm1 = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; AVX512VL-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call <16 x float> @llvm.experimental.complex.fmul.v16f32(<16 x float> %z, <16 x float> %w) #0 + ret <16 x float> %mul +} + +define <2 x double> @intrinsic_slow_v2f64(<2 x double> %z, <2 x double> %w) { +; ALL-LABEL: intrinsic_slow_v2f64: +; ALL: # %bb.0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: .cfi_def_cfa_offset 16 +; ALL-NEXT: vmovapd %xmm1, %xmm2 +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; ALL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; ALL-NEXT: callq __muldc3@PLT +; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; ALL-NEXT: popq %rax +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} + +define <2 x double> @intrinsic_fast_v2f64(<2 x double> %z, <2 x double> %w) { +; ALL-LABEL: intrinsic_fast_v2f64: +; ALL: # %bb.0: +; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,1] +; ALL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; ALL-NEXT: vmulpd %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; ALL-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call fast <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} + +define <2 x double> @intrinsic_limited_v2f64(<2 x double> %z, <2 x double> %w) { +; ALL-LABEL: intrinsic_limited_v2f64: +; ALL: # %bb.0: +; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,1] +; ALL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; ALL-NEXT: vmulpd %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; ALL-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) 
+/- xmm2 +; ALL-NEXT: retq + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) #0 + ret <2 x double> %mul +} + +define <4 x double> @intrinsic_slow_v4f64(<4 x double> %z, <4 x double> %w) { +; ALL-LABEL: intrinsic_slow_v4f64: +; ALL: # %bb.0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: .cfi_def_cfa_offset 16 +; ALL-NEXT: vmovapd %ymm1, %ymm2 +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; ALL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; ALL-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; ALL-NEXT: vzeroupper +; ALL-NEXT: callq __muldc3@PLT +; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; ALL-NEXT: popq %rax +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq + %mul = call <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double> %z, <4 x double> %w) + ret <4 x double> %mul +} + +define <4 x double> @intrinsic_fast_v4f64(<4 x double> %z, <4 x double> %w) { +; ALL-LABEL: intrinsic_fast_v4f64: +; ALL: # %bb.0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm1[1,1,3,3] +; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,3,2] +; ALL-NEXT: vmulpd %ymm2, %ymm3, %ymm2 +; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 +; ALL-NEXT: retq + %mul = call fast <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double> %z, <4 x double> %w) + ret <4 x double> %mul +} + +define <4 x double> @intrinsic_limited_v4f64(<4 x double> %z, <4 x double> %w) { +; ALL-LABEL: intrinsic_limited_v4f64: +; ALL: # %bb.0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm1[1,1,3,3] +; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,3,2] +; ALL-NEXT: vmulpd %ymm2, %ymm3, %ymm2 +; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 +; ALL-NEXT: retq + %mul = call <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double> %z, <4 x double> %w) #0 + ret <4 x double> %mul +} + +define <8 x double> @intrinsic_slow_v8f64(<8 x double> %z, <8 x double> %w) { +; FMA-LABEL: intrinsic_slow_v8f64: +; FMA: # %bb.0: +; FMA-NEXT: pushq %rax +; FMA-NEXT: .cfi_def_cfa_offset 16 +; FMA-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; FMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; FMA-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; FMA-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; FMA-NEXT: vzeroupper +; FMA-NEXT: callq __muldc3@PLT +; FMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; FMA-NEXT: popq %rax +; FMA-NEXT: .cfi_def_cfa_offset 8 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_slow_v8f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %rax +; AVX512VL-NEXT: .cfi_def_cfa_offset 16 +; AVX512VL-NEXT: vmovapd %zmm1, %zmm2 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq __muldc3@PLT +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: popq %rax +; AVX512VL-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-NEXT: retq + %mul = call <8 x double> @llvm.experimental.complex.fmul.v8f64(<8 x double> %z, <8 x double> %w) + ret <8 x double> %mul +} + +define <8 x double> @intrinsic_fast_v8f64(<8 x double> %z, <8 x double> %w) { +; FMA-LABEL: intrinsic_fast_v8f64: +; FMA: # %bb.0: +; FMA-NEXT: vpermilpd {{.*#+}} ymm4 = ymm2[1,1,3,3] +; FMA-NEXT: 
vpermilpd {{.*#+}} ymm5 = ymm0[1,0,3,2] +; FMA-NEXT: vmulpd %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovddup {{.*#+}} ymm2 = ymm2[0,0,2,2] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4 +; FMA-NEXT: vpermilpd {{.*#+}} ymm2 = ymm3[1,1,3,3] +; FMA-NEXT: vpermilpd {{.*#+}} ymm4 = ymm1[1,0,3,2] +; FMA-NEXT: vmulpd %ymm2, %ymm4, %ymm2 +; FMA-NEXT: vmovddup {{.*#+}} ymm3 = ymm3[0,0,2,2] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm2 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_fast_v8f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm2 = zmm1[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm3 = zmm0[1,0,3,2,5,4,7,6] +; AVX512VL-NEXT: vmulpd %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vmovddup {{.*#+}} zmm1 = zmm1[0,0,2,2,4,4,6,6] +; AVX512VL-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call fast <8 x double> @llvm.experimental.complex.fmul.v8f64(<8 x double> %z, <8 x double> %w) + ret <8 x double> %mul +} + +define <8 x double> @intrinsic_limited_v8f64(<8 x double> %z, <8 x double> %w) { +; FMA-LABEL: intrinsic_limited_v8f64: +; FMA: # %bb.0: +; FMA-NEXT: vpermilpd {{.*#+}} ymm4 = ymm2[1,1,3,3] +; FMA-NEXT: vpermilpd {{.*#+}} ymm5 = ymm0[1,0,3,2] +; FMA-NEXT: vmulpd %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovddup {{.*#+}} ymm2 = ymm2[0,0,2,2] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4 +; FMA-NEXT: vpermilpd {{.*#+}} ymm2 = ymm3[1,1,3,3] +; FMA-NEXT: vpermilpd {{.*#+}} ymm4 = ymm1[1,0,3,2] +; FMA-NEXT: vmulpd %ymm2, %ymm4, %ymm2 +; FMA-NEXT: vmovddup {{.*#+}} ymm3 = ymm3[0,0,2,2] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm2 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_limited_v8f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm2 = zmm1[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm3 = zmm0[1,0,3,2,5,4,7,6] +; AVX512VL-NEXT: vmulpd %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vmovddup {{.*#+}} zmm1 = zmm1[0,0,2,2,4,4,6,6] +; AVX512VL-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call <8 x double> @llvm.experimental.complex.fmul.v8f64(<8 x double> %z, <8 x double> %w) #0 + ret <8 x double> %mul +} + +define <6 x float> @intrinsic_fast_v6f32(<6 x float> %z, <6 x float> %w) { +; ALL-LABEL: intrinsic_fast_v6f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; ALL-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: vmulps %ymm2, %ymm3, %ymm2 +; ALL-NEXT: vmovsldup {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 +; ALL-NEXT: retq + %mul = call fast <6 x float> @llvm.experimental.complex.fmul.v6f32(<6 x float> %z, <6 x float> %w) + ret <6 x float> %mul +} + +define <6 x double> @intrinsic_fast_v6f64(<6 x double> %z, <6 x double> %w) { +; FMA-LABEL: intrinsic_fast_v6f64: +; FMA: # %bb.0: +; FMA-NEXT: movq %rdi, %rax +; FMA-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; FMA-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; FMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; FMA-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm7[0] +; FMA-NEXT: vinsertf128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; FMA-NEXT: vpermilpd {{.*#+}} ymm2 = ymm0[1,0,3,2] +; FMA-NEXT: vpermilpd {{.*#+}} ymm3 = ymm1[1,1,3,3] +; FMA-NEXT: vmulpd %ymm3, %ymm2, %ymm2 +; FMA-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} ymm1 = (ymm0 * ymm1) +/- ymm2 +; FMA-NEXT: vunpcklpd {{.*#+}} xmm0 = 
xmm4[0],xmm5[0] +; FMA-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm5[0],xmm4[0] +; FMA-NEXT: vmovapd {{[0-9]+}}(%rsp), %xmm3 +; FMA-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,1] +; FMA-NEXT: vmulpd %xmm4, %xmm2, %xmm2 +; FMA-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} xmm3 = (xmm0 * xmm3) +/- xmm2 +; FMA-NEXT: vmovapd %xmm3, 32(%rdi) +; FMA-NEXT: vmovapd %ymm1, (%rdi) +; FMA-NEXT: vzeroupper +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_fast_v6f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm2 = zmm1[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm3 = zmm0[1,0,3,2,5,4,7,6] +; AVX512VL-NEXT: vmulpd %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vmovddup {{.*#+}} zmm1 = zmm1[0,0,2,2,4,4,6,6] +; AVX512VL-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call fast <6 x double> @llvm.experimental.complex.fmul.v6f64(<6 x double> %z, <6 x double> %w) + ret <6 x double> %mul +} + +; Test the vector bigger than 512 bits. +define <32 x float> @intrinsic_fast_v32f32(<32 x float> %z, <32 x float> %w) { +; FMA-LABEL: intrinsic_fast_v32f32: +; FMA: # %bb.0: +; FMA-NEXT: vmovshdup {{.*#+}} ymm8 = ymm4[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm9 = ymm0[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm8, %ymm9, %ymm8 +; FMA-NEXT: vmovsldup {{.*#+}} ymm4 = ymm4[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) +/- ymm8 +; FMA-NEXT: vmovshdup {{.*#+}} ymm4 = ymm5[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm8 = ymm1[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm4, %ymm8, %ymm4 +; FMA-NEXT: vmovsldup {{.*#+}} ymm5 = ymm5[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm1 = (ymm5 * ymm1) +/- ymm4 +; FMA-NEXT: vmovshdup {{.*#+}} ymm4 = ymm6[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm5 = ymm2[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovsldup {{.*#+}} ymm5 = ymm6[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm2 = (ymm5 * ymm2) +/- ymm4 +; FMA-NEXT: vmovshdup {{.*#+}} ymm4 = ymm7[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovsldup {{.*#+}} ymm5 = ymm7[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm3 = (ymm5 * ymm3) +/- ymm4 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_fast_v32f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm4 = zmm2[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermilps {{.*#+}} zmm5 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512VL-NEXT: vmulps %zmm4, %zmm5, %zmm4 +; AVX512VL-NEXT: vmovsldup {{.*#+}} zmm2 = zmm2[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; AVX512VL-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm2 * zmm0) +/- zmm4 +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm2 = zmm3[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermilps {{.*#+}} zmm4 = zmm1[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512VL-NEXT: vmulps %zmm2, %zmm4, %zmm2 +; AVX512VL-NEXT: vmovsldup {{.*#+}} zmm3 = zmm3[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; AVX512VL-NEXT: vfmaddsub213ps {{.*#+}} zmm1 = (zmm3 * zmm1) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call fast <32 x float> @llvm.experimental.complex.fmul.v32f32(<32 x float> %z, <32 x float> %w) + ret <32 x float> %mul +} + +attributes #0 = { "complex-range"="limited" } diff --git a/llvm/test/CodeGen/X86/complex-win32.ll b/llvm/test/CodeGen/X86/complex-win32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-win32.ll @@ -0,0 +1,59 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-windows-msvc | FileCheck %s + +; Check that we handle the ABI of the complex functions correctly for 32-bit +; windows API. Compiler-rt only includes mulsc3/muldc3, so we only test those. + +declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>) +declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @intrinsic_f32(<2 x float> %z, <2 x float> %w) { +; CHECK-LABEL: intrinsic_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $24, %esp +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps (%esp) +; CHECK-NEXT: calll ___mulsc3 +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: addl $24, %esp +; CHECK-NEXT: retl + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) { +; CHECK-LABEL: intrinsic_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $56, %esp +; CHECK-NEXT: fldl 8(%ebp) +; CHECK-NEXT: fldl 16(%ebp) +; CHECK-NEXT: fldl 24(%ebp) +; CHECK-NEXT: fldl 32(%ebp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: calll ___muldc3 +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fxch %st(1) +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} diff --git a/llvm/test/CodeGen/X86/complex-win64.ll b/llvm/test/CodeGen/X86/complex-win64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-win64.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-windows-msvc | FileCheck %s + +; Check that we handle the ABI of the complex functions correctly for 64-bit +; windows API. Compiler-rt only includes mulsc3/muldc3, so we only test those. 
+ +declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>) +declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @intrinsic_f32(<2 x float> %z, <2 x float> %w) nounwind { +; CHECK-LABEL: intrinsic_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: movaps (%rdx), %xmm2 +; CHECK-NEXT: movaps (%rcx), %xmm0 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] +; CHECK-NEXT: callq __mulsc3 +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) nounwind { +; CHECK-LABEL: intrinsic_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movaps (%rdx), %xmm3 +; CHECK-NEXT: movaps (%rcx), %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; CHECK-NEXT: movhps %xmm3, {{[0-9]+}}(%rsp) +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: callq __muldc3 +; CHECK-NEXT: movups {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: retq + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} diff --git a/llvm/test/CodeGen/X86/fp16-complex-multiply.ll b/llvm/test/CodeGen/X86/fp16-complex-multiply.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/fp16-complex-multiply.ll @@ -0,0 +1,231 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s + +declare <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half>, <2 x half>) +declare <4 x half> @llvm.experimental.complex.fmul.v4f16(<4 x half>, <4 x half>) +declare <8 x half> @llvm.experimental.complex.fmul.v8f16(<8 x half>, <8 x half>) +declare <16 x half> @llvm.experimental.complex.fmul.v16f16(<16 x half>, <16 x half>) +declare <32 x half> @llvm.experimental.complex.fmul.v32f16(<32 x half>, <32 x half>) +declare <20 x half> @llvm.experimental.complex.fmul.v20f16(<20 x half>, <20 x half>) +declare <64 x half> @llvm.experimental.complex.fmul.v64f16(<64 x half>, <64 x half>) + +; FIXME: llvm.experimental.complex.fmul.v2f16 should be lowered to vfmulcsh +define <2 x half> @intrinsic_fast_v2f16(<2 x half> %z, <2 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %mul = call fast <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w) + ret <2 x half> %mul +} + +define <4 x half> @intrinsic_fast_v4f16(<4 x half> %z, <4 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %mul = call fast <4 x half> @llvm.experimental.complex.fmul.v4f16(<4 x half> %z, <4 x half> %w) + ret <4 x half> %mul +} + +define <8 x half> @intrinsic_fast_v8f16(<8 x half> %z, <8 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 
+; CHECK-NEXT: retq + %mul = call fast <8 x half> @llvm.experimental.complex.fmul.v8f16(<8 x half> %z, <8 x half> %w) + ret <8 x half> %mul +} + +define <16 x half> @intrinsic_fast_v16f16(<16 x half> %z, <16 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %mul = call fast <16 x half> @llvm.experimental.complex.fmul.v16f16(<16 x half> %z, <16 x half> %w) + ret <16 x half> %mul +} + +define <32 x half> @intrinsic_fast_v32f16(<32 x half> %z, <32 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mul = call fast <32 x half> @llvm.experimental.complex.fmul.v32f16(<32 x half> %z, <32 x half> %w) + ret <32 x half> %mul +} + +define <20 x half> @intrinsic_fast_v20f16(<20 x half> %z, <20 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v20f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mul = call fast <20 x half> @llvm.experimental.complex.fmul.v20f16(<20 x half> %z, <20 x half> %w) + ret <20 x half> %mul +} + +define <2 x half> @intrinsic_limited_v2f16(<2 x half> %z, <2 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w) #0 + ret <2 x half> %mul +} + +define <4 x half> @intrinsic_limited_v4f16(<4 x half> %z, <4 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %mul = call <4 x half> @llvm.experimental.complex.fmul.v4f16(<4 x half> %z, <4 x half> %w) #0 + ret <4 x half> %mul +} + +define <8 x half> @intrinsic_limited_v8f16(<8 x half> %z, <8 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %mul = call <8 x half> @llvm.experimental.complex.fmul.v8f16(<8 x half> %z, <8 x half> %w) #0 + ret <8 x half> %mul +} + +define <16 x half> @intrinsic_limited_v16f16(<16 x half> %z, <16 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %mul = call <16 x half> @llvm.experimental.complex.fmul.v16f16(<16 x half> %z, <16 x half> %w) #0 + ret <16 x half> %mul +} + +define <32 x half> @intrinsic_limited_v32f16(<32 x half> %z, <32 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mul = call <32 x half> @llvm.experimental.complex.fmul.v32f16(<32 x half> %z, <32 x half> %w) #0 + ret <32 x half> %mul +} + +define <20 x half> @intrinsic_limited_v20f16(<20 x half> %z, <20 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v20f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mul = call <20 x half> @llvm.experimental.complex.fmul.v20f16(<20 x half> %z, <20 x half> %w) #0 + ret <20 x half> %mul +} + +; Test the vector size bigger than 512 bits +define <64 x half> @intrinsic_limited_v64f16(<64 x half> %z, <64 x half> %w) { +; 
CHECK-LABEL: intrinsic_limited_v64f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %zmm2, %zmm0, %zmm4 +; CHECK-NEXT: vfmulcph %zmm3, %zmm1, %zmm2 +; CHECK-NEXT: vmovaps %zmm4, %zmm0 +; CHECK-NEXT: vmovaps %zmm2, %zmm1 +; CHECK-NEXT: retq + %mul = call <64 x half> @llvm.experimental.complex.fmul.v64f16(<64 x half> %z, <64 x half> %w) #0 + ret <64 x half> %mul +} + +define <2 x half> @intrinsic_slow_v2f16(<2 x half> %z, <2 x half> %w) { +; CHECK-LABEL: intrinsic_slow_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovdqa %xmm1, %xmm2 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w) + ret <2 x half> %mul +} + +define <4 x half> @intrinsic_slow_v4f16(<4 x half> %z, <4 x half> %w) { +; CHECK-LABEL: intrinsic_slow_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovdqa %xmm1, %xmm2 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <4 x half> @llvm.experimental.complex.fmul.v4f16(<4 x half> %z, <4 x half> %w) + ret <4 x half> %mul +} + +define <8 x half> @intrinsic_slow_v8f16(<8 x half> %z, <8 x half> %w) { +; CHECK-LABEL: intrinsic_slow_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovdqa %xmm1, %xmm2 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <8 x half> @llvm.experimental.complex.fmul.v8f16(<8 x half> %z, <8 x half> %w) + ret <8 x half> %mul +} + +define <16 x half> @intrinsic_slow_v16f16(<16 x half> %z, <16 x half> %w) { +; CHECK-LABEL: intrinsic_slow_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovdqa %ymm1, %ymm2 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <16 x half> @llvm.experimental.complex.fmul.v16f16(<16 x half> %z, <16 x half> %w) + ret <16 x half> %mul +} + +define <32 x half> @intrinsic_slow_v32f16(<32 x half> %z, <32 x half> %w) { +; CHECK-LABEL: intrinsic_slow_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm2 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <32 x half> @llvm.experimental.complex.fmul.v32f16(<32 x half> %z, <32 x half> %w) + ret <32 x half> %mul +} + +attributes #0 = { "complex-range"="limited" } diff --git a/llvm/test/Transforms/InstCombine/complex-math.ll b/llvm/test/Transforms/InstCombine/complex-math.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/Transforms/InstCombine/complex-math.ll @@ -0,0 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals +; RUN: opt < %s -instcombine -S -inst-combine-complex | FileCheck %s + +; Check that we match the simple expansions of complex multiplication and +; division, whether the target complex value is made by returning a struct, +; vector, or by storing into memory. + +%complex.double = type {double, double} + +define %complex.double @struct_mul(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @struct_mul( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %x = fsub double %ac, %bd + %y = fadd double %ad, %bc + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue %complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +define <2 x double> @vector_mul(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @vector_mul( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: ret <2 x double> [[TMP5]] +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %x = fsub double %ac, %bd + %y = fadd double %ad, %bc + %res = insertelement <2 x double> zeroinitializer, double %x, i32 0 + %res.1 = insertelement <2 x double> %res, double %y, i32 1 + ret <2 x double> %res.1 +} + +define void @memory_mul(double %a, double %b, double %c, double %d, %complex.double* %dest) { +; CHECK-LABEL: @memory_mul( +; CHECK-NEXT: [[DEST_REAL:%.*]] = getelementptr [[COMPLEX_DOUBLE:%.*]], %complex.double* [[DEST:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[DEST_IMAG:%.*]] = getelementptr [[COMPLEX_DOUBLE]], %complex.double* [[DEST]], i64 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call 
<2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: store double [[TMP6]], double* [[DEST_REAL]], align 8 +; CHECK-NEXT: store double [[TMP7]], double* [[DEST_IMAG]], align 8 +; CHECK-NEXT: ret void +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %x = fsub double %ac, %bd + %y = fadd double %ad, %bc + %dest.real = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 0 + %dest.imag = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 1 + store double %x, double* %dest.real + store double %y, double* %dest.imag + ret void +} + +define %complex.double @fast_mul(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @fast_mul( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul fast double %a, %c + %bd = fmul fast double %b, %d + %ad = fmul fast double %a, %d + %bc = fmul fast double %b, %c + %x = fsub fast double %ac, %bd + %y = fadd fast double %ad, %bc + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue %complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +define %complex.double @fastish_mul(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @fastish_mul( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call ninf <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul fast double %a, %c + %bd = fmul nnan ninf nsz double %b, %d + %ad = fmul ninf arcp contract double %a, %d + %bc = fmul reassoc nsz ninf double %b, %c + %x = fsub ninf arcp afn double %ac, %bd + %y = fadd afn nnan ninf double %ad, %bc + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue 
%complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +define %complex.double @struct_div(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @struct_div( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %cc = fmul double %c, %c + %dd = fmul double %d, %d + %scale = fadd double %cc, %dd + %x_noscale = fadd double %ac, %bd + %y_noscale = fsub double %bc, %ad + %x = fdiv double %x_noscale, %scale + %y = fdiv double %y_noscale, %scale + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue %complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +define <2 x double> @vector_div(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @vector_div( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: ret <2 x double> [[TMP5]] +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %cc = fmul double %c, %c + %dd = fmul double %d, %d + %scale = fadd double %cc, %dd + %x_noscale = fadd double %ac, %bd + %y_noscale = fsub double %bc, %ad + %x = fdiv double %x_noscale, %scale + %y = fdiv double %y_noscale, %scale + %res = insertelement <2 x double> zeroinitializer, double %x, i32 0 + %res.1 = insertelement <2 x double> %res, double %y, i32 1 + ret <2 x double> %res.1 +} + +define void @memory_div(double %a, double %b, double %c, double %d, %complex.double* %dest) { +; CHECK-LABEL: @memory_div( +; CHECK-NEXT: [[DEST_REAL:%.*]] = getelementptr [[COMPLEX_DOUBLE:%.*]], %complex.double* [[DEST:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[DEST_IMAG:%.*]] = getelementptr [[COMPLEX_DOUBLE]], %complex.double* [[DEST]], i64 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 
x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: store double [[TMP6]], double* [[DEST_REAL]], align 8 +; CHECK-NEXT: store double [[TMP7]], double* [[DEST_IMAG]], align 8 +; CHECK-NEXT: ret void +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %cc = fmul double %c, %c + %dd = fmul double %d, %d + %scale = fadd double %cc, %dd + %x_noscale = fadd double %ac, %bd + %y_noscale = fsub double %bc, %ad + %x = fdiv double %x_noscale, %scale + %y = fdiv double %y_noscale, %scale + %dest.real = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 0 + %dest.imag = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 1 + store double %x, double* %dest.real + store double %y, double* %dest.imag + ret void +} + +define %complex.double @fast_div(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @fast_div( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul fast double %a, %c + %bd = fmul fast double %b, %d + %ad = fmul fast double %a, %d + %bc = fmul fast double %b, %c + %cc = fmul fast double %c, %c + %dd = fmul fast double %d, %d + %scale = fadd fast double %cc, %dd + %x_noscale = fadd fast double %ac, %bd + %y_noscale = fsub fast double %bc, %ad + %x = fdiv fast double %x_noscale, %scale + %y = fdiv fast double %y_noscale, %scale + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue %complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +define %complex.double @fastish_div(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @fastish_div( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call arcp <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul arcp contract double %a, %c 
+ %bd = fmul arcp afn ninf reassoc double %b, %d + %ad = fmul arcp afn ninf double %a, %d + %bc = fmul arcp nsz reassoc double %b, %c + %cc = fmul arcp nsz afn double %c, %c + %dd = fmul arcp nsz double %d, %d + %scale = fadd arcp nsz contract nnan reassoc double %cc, %dd + %x_noscale = fadd arcp nsz contract ninf nnan double %ac, %bd + %y_noscale = fsub arcp nsz contract reassoc double %bc, %ad + %x = fdiv arcp ninf nnan reassoc double %x_noscale, %scale + %y = fdiv arcp nnan double %y_noscale, %scale + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue %complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nofree nosync nounwind readnone willreturn } +; CHECK: attributes #[[ATTR1]] = { "complex-range"="limited" } +;.
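
For reference, the patterns matched by the InstCombine tests above (and the limited-range lowerings checked in the codegen tests earlier) correspond to the textbook complex formulas with no NaN/infinity recovery step; for operands a + bi and c + di this is:

\[(a + bi)(c + di) = (ac - bd) + (ad + bc)\,i\]
\[\frac{a + bi}{c + di} = \frac{(ac + bd) + (bc - ad)\,i}{c^2 + d^2}\]

This is a sketch of the intended semantics only: the recognized calls carry the "complex-range"="limited" attribute (or fast-math flags) precisely because these expansions are not correct for all inputs, and the full-range cases instead go through the __mulsc3/__muldc3-style library calls exercised in the slow, win32, and win64 tests.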