diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -185,6 +185,8 @@
 /// float-to-int conversion instructions.
 CODEGENOPT(StrictFloatCastOverflow, 1, 1)
+CODEGENOPT(UseComplexIntrinsics, 1, 0) ///< Use LLVM complex intrinsics
+
 CODEGENOPT(UniformWGSize , 1, 0) ///< -cl-uniform-work-group-size
 CODEGENOPT(NoZeroInitializedInBSS , 1, 0) ///< -fno-zero-initialized-in-bss.
 /// Method of Objective-C dispatch to use.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1816,6 +1816,13 @@
            "floating-point expressions are evaluated">,
   NegFlag<SetFalse>>;
 
+defm use_complex_intrinsics : BoolFOption<"use-complex-intrinsics",
+  CodeGenOpts<"UseComplexIntrinsics">, DefaultFalse,
+  PosFlag<SetTrue>,
+  NegFlag<SetFalse>>;
+
 def ffor_scope : Flag<["-"], "ffor-scope">, Group<f_Group>;
 def fno_for_scope : Flag<["-"], "fno-for-scope">, Group<f_Group>;
diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp
--- a/clang/lib/CodeGen/CGExprComplex.cpp
+++ b/clang/lib/CodeGen/CGExprComplex.cpp
@@ -88,6 +88,13 @@
   ComplexPairTy EmitScalarToComplexCast(llvm::Value *Val, QualType SrcType,
                                         QualType DestType, SourceLocation Loc);
 
+  /// Convert an LLVM complex type representation into a pair of values.
+  ComplexPairTy BreakLLVMComplexType(llvm::Value *Val) {
+    llvm::Value *Real = Builder.CreateExtractElement(Val, uint64_t(0));
+    llvm::Value *Imag = Builder.CreateExtractElement(Val, uint64_t(1));
+    return ComplexPairTy(Real, Imag);
+  }
+
   //===--------------------------------------------------------------------===//
   //                            Visitor Methods
   //===--------------------------------------------------------------------===//
@@ -701,6 +708,14 @@
   // still more of this within the type system.
 
   if (Op.LHS.second && Op.RHS.second) {
+    if (CGF.CGM.getCodeGenOpts().UseComplexIntrinsics) {
+      Value *Op0 = Builder.CreateComplexValue(Op.LHS.first, Op.LHS.second);
+      Value *Op1 = Builder.CreateComplexValue(Op.RHS.first, Op.RHS.second);
+      // TODO: Support STDC CX_LIMITED_RANGE here.
+      Value *Result = Builder.CreateComplexMul(Op0, Op1, false);
+      return BreakLLVMComplexType(Result);
+    }
+
     // If both operands are complex, emit the core math directly, and then
     // test for NaNs. If we find NaNs in the result, we delegate to a libcall
     // to carefully re-compute the correct infinity representation if
@@ -794,6 +809,19 @@
   llvm::Value *DSTr, *DSTi;
   if (LHSr->getType()->isFloatingPointTy()) {
+    // If we are using complex intrinsics, do so whenever the right-hand side
+    // is complex, since no major simplification is possible in this scenario.
+    // (Simplifications are possible if the LHS is real or pure imaginary.)
+    if (CGF.CGM.getCodeGenOpts().UseComplexIntrinsics && RHSi) {
+      llvm::Value *Op0 =
+          Builder.CreateComplexValue(Op.LHS.first, Op.LHS.second);
+      llvm::Value *Op1 =
+          Builder.CreateComplexValue(Op.RHS.first, Op.RHS.second);
+      // TODO: Support STDC CX_LIMITED_RANGE here.
+      llvm::Value *Result = Builder.CreateComplexDiv(Op0, Op1, false);
+      return BreakLLVMComplexType(Result);
+    }
+
     // If we have a complex operand on the RHS and FastMath is not allowed, we
     // delegate to a libcall to handle all of the complexities and minimize
     // underflow/overflow cases. When FastMath is allowed we construct the
diff --git a/clang/test/CodeGen/complex-intrinsics.c b/clang/test/CodeGen/complex-intrinsics.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/complex-intrinsics.c
@@ -0,0 +1,180 @@
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown -o - | FileCheck %s --check-prefix=NOINTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-pc-win64 -o - | FileCheck %s --check-prefix=NOINTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple i686-unknown-unknown -o - | FileCheck %s --check-prefix=NOINTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple spir -o - | FileCheck %s --check-prefix=NOINTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown -fuse-complex-intrinsics -o - | FileCheck %s --check-prefix=INTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-pc-win64 -fuse-complex-intrinsics -o - | FileCheck %s --check-prefix=INTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown -fuse-complex-intrinsics -DT=int -o - | FileCheck %s --check-prefix=INT
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown -DT=int -o - | FileCheck %s --check-prefix=INT
+
+#ifndef T
+#define T float
+#endif
+
+T check_var;
+// INTRIN: @check_var = global [[T:[a-z0-9]+]]
+// NOINTRIN: @check_var = global [[T:[a-z0-9]+]]
+// INT: @check_var = global [[T:i[0-9]+]]
+
+T _Complex add_rc(T a, T _Complex b) {
+  // INTRIN-LABEL: @add_rc(
+  // INTRIN-COUNT-1: fadd [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @add_rc(
+  // NOINTRIN-COUNT-1: fadd [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @add_rc(
+  // INT-COUNT-1: add [[T]]
+  // INT: ret
+  return a + b;
+}
+
+T _Complex add_cr(T _Complex a, T b) {
+  // INTRIN-LABEL: @add_cr(
+  // INTRIN-COUNT-1: fadd [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @add_cr(
+  // NOINTRIN-COUNT-1: fadd [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @add_cr(
+  // INT-COUNT-1: add [[T]]
+  // INT: ret
+  return a + b;
+}
+
+T _Complex add_cc(T _Complex a, T _Complex b) {
+  // INTRIN-LABEL: @add_cc(
+  // INTRIN-COUNT-2: fadd [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @add_cc(
+  // NOINTRIN-COUNT-2: fadd [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @add_cc(
+  // INT-COUNT-2: add [[T]]
+  // INT: ret
+  return a + b;
+}
+
+T _Complex sub_rc(T a, T _Complex b) {
+  // INTRIN-LABEL: @sub_rc(
+  // INTRIN: fsub [[T]]
+  // INTRIN: fneg [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @sub_rc(
+  // NOINTRIN: fsub [[T]]
+  // NOINTRIN: fneg [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @sub_rc(
+  // INT-COUNT-2: sub [[T]]
+  // INT: ret
+  return a - b;
+}
+
+T _Complex sub_cr(T _Complex a, T b) {
+  // INTRIN-LABEL: @sub_cr(
+  // INTRIN: fsub [[T]]
+  // INTRIN-NOT: fsub [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @sub_cr(
+  // NOINTRIN: fsub [[T]]
+  // NOINTRIN-NOT: fsub [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @sub_cr(
+  // INT-COUNT-2: sub [[T]]
+  // INT: ret
+  return a - b;
+}
+
+T _Complex sub_cc(T _Complex a, T _Complex b) {
+  // INTRIN-LABEL: @sub_cc(
+  // INTRIN-COUNT-2: fsub [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @sub_cc(
+  // NOINTRIN-COUNT-2: fsub [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @sub_cc(
+  // INT-COUNT-2: sub [[T]]
+  // INT: ret
+  return a - b;
+}
+
+T _Complex mul_rc(T a, T _Complex b) {
+  // INTRIN-LABEL: @mul_rc(
+  // INTRIN-COUNT-2: fmul [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @mul_rc(
+  // NOINTRIN-COUNT-2: fmul [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @mul_rc(
+  // INT-COUNT-4: mul [[T]]
+  // INT: ret
+  return a * b;
+}
+
+T _Complex mul_cr(T _Complex a, T b) {
+  // INTRIN-LABEL: @mul_cr(
+  // INTRIN-COUNT-2: fmul [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @mul_cr(
+  // NOINTRIN-COUNT-2: fmul [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @mul_cr(
+  // INT-COUNT-4: mul [[T]]
+  // INT: ret
+  return a * b;
+}
+
+T _Complex mul_cc(T _Complex a, T _Complex b) {
+  // INTRIN-LABEL: @mul_cc(
+  // INTRIN-NOT: fmul [[T]]
+  // INTRIN: call {{.*}} @llvm.experimental.complex.fmul
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @mul_cc(
+  // NOINTRIN-COUNT-4: fmul [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @mul_cc(
+  // INT-COUNT-4: mul [[T]]
+  // INT: ret
+  return a * b;
+}
+
+T _Complex div_rc(T a, T _Complex b) {
+  // INTRIN-LABEL: @div_rc(
+  // INTRIN-NOT: fdiv [[T]]
+  // INTRIN: call {{.*}} @llvm.experimental.complex.fdiv
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @div_rc(
+  // NOINTRIN: call {{.*}} @__div
+  // NOINTRIN: ret
+  // INT-LABEL: @div_rc(
+  // INT-COUNT-6: mul [[T]]
+  // INT: ret
+  return a / b;
+}
+
+T _Complex div_cr(T _Complex a, T b) {
+  // INTRIN-LABEL: @div_cr(
+  // INTRIN-COUNT-2: fdiv [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @div_cr(
+  // NOINTRIN-COUNT-2: fdiv [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @div_cr(
+  // INT-COUNT-5: mul [[T]]
+  // INT: ret
+  return a / b;
+}
+
+T _Complex div_cc(T _Complex a, T _Complex b) {
+  // INTRIN-LABEL: @div_cc(
+  // INTRIN-NOT: fdiv [[T]]
+  // INTRIN: call {{.*}} @llvm.experimental.complex.fdiv
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @div_cc(
+  // NOINTRIN: call {{.*}} @__div
+  // NOINTRIN: ret
+  // INT-LABEL: @div_cc(
+  // INT-COUNT-6: mul [[T]]
+  // INT: ret
+  return a / b;
+}
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -17238,6 +17238,167 @@
 None.
 
+Complex Intrinsics
+------------------
+
+Complex numbers are currently represented, for intrinsic purposes, as vectors
+of floating-point numbers. A scalar complex type is represented using the type
+``<2 x floatty>``, with index ``0`` corresponding to the real part of the
+number and index ``1`` corresponding to the imaginary part of the number. A
+vector complex type can be represented by an even-length vector of
+floating-point numbers, with even indices (``0``, ``2``, etc.) corresponding
+to the real parts of numbers and odd indices (``1``, ``3``, etc.) to the
+corresponding imaginary parts.
+
+In general, these intrinsics have the same semantics as their definitions in
+Annex G of the C specification. In particular, this means that multiplication,
+division, and absolute value cannot be represented with their regular
+algebraic formulas, as those formulas can produce a NaN value where an
+infinity is required, or can let an intermediate value overflow. However,
+adding the ``complex-limited-range`` attribute to the call site explicitly
+requests the regular algebraic formula.
+
+In addition to the ``complex-limited-range`` attribute, these intrinsics also
+respect the fast-math flags. These flags are applied to all of the
+floating-point expressions the intrinsic would produce if it were expanded. In
+particular, either the ``nnan`` or the ``ninf`` flag is sufficient to remove
+all of the recalculation that otherwise handles NaN results.
+
+Another attribute, ``complex-no-scale``, applies to the division intrinsic.
+This attribute allows the operation to be calculated according to the regular
+algebraic formula without first scaling the operands to prevent a potentially
+spurious overflow. It still retains the checks that convert NaN results back
+to infinities (those can be removed only by ``complex-limited-range``).
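+
+As an illustration (a usage sketch only, not text from a frontend; the input
+values ``%z`` and ``%w`` are hypothetical), a default call, a call relying on
+fast-math flags, and a call requesting limited range might look like:
+
+.. code-block:: llvm
+
+      ; Full Annex G semantics; may be expanded to a __mulsc3 libcall.
+      %p = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+
+      ; nnan/ninf allow the NaN-recalculation step to be dropped.
+      %q = call nnan ninf <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+
+      ; Explicitly request the plain algebraic formula (CX_LIMITED_RANGE).
+      %r = call <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #0
+
+      attributes #0 = { "complex-limited-range"="true" }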
+
+Intrinsics for complex addition and subtraction are not provided, as these are
+equivalent to ``fadd`` and ``fsub`` instructions, respectively.
+
+'``llvm.experimental.complex.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> <op1>, <2 x float> <op2>)
+      declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> <op1>, <2 x double> <op2>)
+      declare <4 x float> @llvm.experimental.complex.fmul.v4f32(<4 x float> <op1>, <4 x float> <op2>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.complex.fmul``' intrinsic returns the product of its
+two operands.
+
+Arguments:
+""""""""""
+
+The arguments to the '``llvm.experimental.complex.fmul``' intrinsic must be a
+:ref:`vector <t_vector>` of :ref:`floating-point <t_floating>` type whose
+length is divisible by 2.
+
+Semantics:
+""""""""""
+
+The value produced is the complex product of the two inputs.
+
+If the ``complex-limited-range`` attribute is provided, or the ``ninf`` or
+``nnan`` fast-math flags are provided, the output may be equivalent to the
+following code:
+
+.. code-block:: llvm
+
+      define <2 x float> @limited_complex_mul(<2 x float> %op1, <2 x float> %op2) {
+        %x = extractelement <2 x float> %op1, i32 0 ; real of %op1
+        %y = extractelement <2 x float> %op1, i32 1 ; imag of %op1
+        %u = extractelement <2 x float> %op2, i32 0 ; real of %op2
+        %v = extractelement <2 x float> %op2, i32 1 ; imag of %op2
+        %xu = fmul float %x, %u
+        %yv = fmul float %y, %v
+        %yu = fmul float %y, %u
+        %xv = fmul float %x, %v
+        %out_real = fsub float %xu, %yv
+        %out_imag = fadd float %yu, %xv
+        %ret.0 = insertelement <2 x float> undef, float %out_real, i32 0
+        %ret.1 = insertelement <2 x float> %ret.0, float %out_imag, i32 1
+        ret <2 x float> %ret.1
+      }
+
+Without the ``complex-limited-range`` attribute or the fast-math flags above,
+the code shown is insufficient to handle the result. Instead, code must be
+added to check for infinities if either the real or imaginary component of the
+result is a NaN value.
+
+
+'``llvm.experimental.complex.fdiv.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> <op1>, <2 x float> <op2>)
+      declare <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> <op1>, <2 x double> <op2>)
+      declare <4 x float> @llvm.experimental.complex.fdiv.v4f32(<4 x float> <op1>, <4 x float> <op2>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.complex.fdiv``' intrinsic returns the quotient of its
+two operands.
+
+Arguments:
+""""""""""
+
+The arguments to the '``llvm.experimental.complex.fdiv``' intrinsic must be a
+:ref:`vector <t_vector>` of :ref:`floating-point <t_floating>` type whose
+length is divisible by 2.
+
+Semantics:
+""""""""""
+
+The value produced is the complex quotient of the two inputs.
+
+If the ``complex-limited-range`` attribute is provided, the output will be
+equivalent to the following code:
+
+.. code-block:: llvm
+
+      define <2 x float> @limited_complex_div(<2 x float> %op1, <2 x float> %op2) {
+        %x = extractelement <2 x float> %op1, i32 0 ; real of %op1
+        %y = extractelement <2 x float> %op1, i32 1 ; imag of %op1
+        %u = extractelement <2 x float> %op2, i32 0 ; real of %op2
+        %v = extractelement <2 x float> %op2, i32 1 ; imag of %op2
+        %xu = fmul float %x, %u
+        %yv = fmul float %y, %v
+        %yu = fmul float %y, %u
+        %xv = fmul float %x, %v
+        %uu = fmul float %u, %u
+        %vv = fmul float %v, %v
+        %unscaled_real = fadd float %xu, %yv
+        %unscaled_imag = fsub float %yu, %xv
+        %scale = fadd float %uu, %vv
+        %out_real = fdiv float %unscaled_real, %scale
+        %out_imag = fdiv float %unscaled_imag, %scale
+        %ret.0 = insertelement <2 x float> undef, float %out_real, i32 0
+        %ret.1 = insertelement <2 x float> %ret.0, float %out_imag, i32 1
+        ret <2 x float> %ret.1
+      }
+
+Without the ``complex-limited-range`` attribute, the above code would be an
+insufficient implementation. Instead, code is needed to scale the input values
+to prevent potential overflow; this is true even if the ``nnan`` and ``ninf``
+flags are specified. The ``arcp`` fast-math flag may also be useful, as it
+permits the divisions to be replaced with multiplications by a reciprocal
+instead.
+
+The ``complex-no-scale`` attribute (implied by ``complex-limited-range``) can
+be used to eliminate the scaling requirement.
+
 Matrix Intrinsics
 -----------------
diff --git a/llvm/include/llvm/CodeGen/ExpandComplex.h b/llvm/include/llvm/CodeGen/ExpandComplex.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/ExpandComplex.h
@@ -0,0 +1,25 @@
+//===---- ExpandComplex.h - Expand experimental complex intrinsics -------===//
+//
+// Copyright (C) 2021 Intel Corporation. All rights reserved.
+//
+// The information and source code contained herein is the exclusive
+// property of Intel Corporation and may not be disclosed, examined
+// or reproduced in whole or in part without explicit written authorization
+// from the company.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_EXPANDCOMPLEX_H
+#define LLVM_CODEGEN_EXPANDCOMPLEX_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class ExpandComplexPass : public PassInfoMixin<ExpandComplexPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_EXPANDCOMPLEX_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -473,6 +473,10 @@
   /// printing assembly.
   ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions = true);
 
+  /// This pass expands the experimental complex intrinsics into regular
+  /// floating-point arithmetic or calls to __mulsc3 (or similar) functions.
+  FunctionPass *createExpandComplexPass();
+
   /// This pass expands the experimental reduction intrinsics into sequences of
   /// shuffles.
   FunctionPass *createExpandReductionsPass();
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -647,6 +647,23 @@
     return false;
   }
 
+  /// Enum that specifies how a C complex type is lowered (in LLVM type terms).
+  enum class ComplexABI {
+    Memory, ///< Indicates that a pointer to the struct is passed.
+    Vector, ///< Indicates that T _Complex can be passed as <2 x T>.
+    Struct, ///< Indicates that T _Complex can be passed as {T, T}.
+  };
+
+  /// Returns how a C complex type is lowered when used as the return value.
+  virtual ComplexABI getComplexReturnABI(Type *ScalarFloatTy) const {
+    return ComplexABI::Struct;
+  }
+
+  /// Returns true if the target can match the @llvm.experimental.complex.fmul
+  /// intrinsic with the given type. Such an intrinsic is assumed to be
+  /// matched only when "complex-limited-range" is in effect.
+  virtual bool hasComplexMultiply(Type *FloatTy) const { return false; }
+
   /// Return if the target supports combining a
   /// chain like:
   /// \code
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -301,6 +301,18 @@
 def ProfileSampleAccurate : StrBoolAttr<"profile-sample-accurate">;
 def UseSampleProfile : StrBoolAttr<"use-sample-profile">;
 
+/// This attribute indicates that complex multiply, division, and absolute
+/// value expressions can be simplified to their trivial mathematical
+/// expressions. It has an equivalent effect to specifying the STDC
+/// CX_LIMITED_RANGE pragma in C complex arithmetic code.
+def ComplexLimitedRange : StrBoolAttr<"complex-limited-range">;
+
+/// This attribute indicates that complex division expressions do not need any
+/// pre-scaling of their operands before doing arithmetic and can instead be
+/// computed with the trivial mathematical expression. Unlike
+/// complex-limited-range, however, the NaN processing is still required.
+def ComplexNoScale : StrBoolAttr<"complex-no-scale">;
+
 class CompatRule {
   // The name of the function called to check the attribute of the caller and
   // callee and decide whether inlining should be allowed. The function's
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -1636,6 +1636,34 @@
   Value *CreateNAryOp(unsigned Opc, ArrayRef<Value *> Ops,
                      const Twine &Name = "", MDNode *FPMathTag = nullptr);
 
+  /// Construct a complex value out of a pair of real and imaginary values.
+  /// The resulting value will be a vector, with lane 0 being the real value
+  /// and lane 1 being the imaginary value.
+  /// Either the \p Real or \p Imag parameter may be null, if the input is a
+  /// pure real or pure imaginary number.
+  Value *CreateComplexValue(Value *Real, Value *Imag, const Twine &Name = "") {
+    Type *ScalarTy = (Real ? Real : Imag)->getType();
+    assert(ScalarTy->isFloatingPointTy() &&
+           "Only floating-point types may be complex values.");
+    Type *ComplexTy = FixedVectorType::get(ScalarTy, 2);
+    Value *Base = PoisonValue::get(ComplexTy);
+    if (Real)
+      Base = CreateInsertElement(Base, Real, uint64_t(0), Name);
+    if (Imag)
+      Base = CreateInsertElement(Base, Imag, uint64_t(1), Name);
+    return Base;
+  }
+
+  /// Construct a complex multiply operation, setting fast-math flags and the
+  /// complex-limited-range attribute as appropriate.
+  Value *CreateComplexMul(Value *L, Value *R, bool CxLimitedRange,
+                          const Twine &Name = "");
+
+  /// Construct a complex divide operation, setting fast-math flags and the
+  /// complex-limited-range and complex-no-scale attributes as appropriate.
+  Value *CreateComplexDiv(Value *L, Value *R, bool CxLimitedRange,
+                          bool CxNoScale = false, const Twine &Name = "");
+
   //===--------------------------------------------------------------------===//
   // Instruction creation methods: Memory Instructions
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1730,6 +1730,16 @@
                                  [llvm_anyvector_ty]>;
 }
 
+//===----- Complex math intrinsics ----------------------------------------===//
+
+def int_experimental_complex_fmul : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+    [LLVMMatchType<0>, LLVMMatchType<0>],
+    [IntrNoMem]>;
+
+def int_experimental_complex_fdiv : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+    [LLVMMatchType<0>, LLVMMatchType<0>],
+    [IntrNoMem]>;
+
 //===----- Matrix intrinsics ---------------------------------------------===//
 
 def int_matrix_transpose
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -153,6 +153,7 @@
 void initializeEHContGuardCatchretPass(PassRegistry &);
 void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&);
 void initializeEntryExitInstrumenterPass(PassRegistry&);
+void initializeExpandComplexPass(PassRegistry &);
 void initializeExpandMemCmpPassPass(PassRegistry&);
 void initializeExpandPostRAPass(PassRegistry&);
 void initializeExpandReductionsPass(PassRegistry&);
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -26,6 +26,7 @@
   EdgeBundles.cpp
   EHContGuardCatchret.cpp
   ExecutionDomainFix.cpp
+  ExpandComplex.cpp
   ExpandMemCmp.cpp
   ExpandPostRAPseudos.cpp
   ExpandReductions.cpp
diff --git a/llvm/lib/CodeGen/ExpandComplex.cpp b/llvm/lib/CodeGen/ExpandComplex.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/CodeGen/ExpandComplex.cpp
@@ -0,0 +1,282 @@
+//===-- ExpandComplex.cpp - Expand experimental complex intrinsics -------===//
+//
+// Copyright (C) 2021 Intel Corporation. All rights reserved.
+//
+// The information and source code contained herein is the exclusive
+// property of Intel Corporation and may not be disclosed, examined
+// or reproduced in whole or in part without explicit written authorization
+// from the company.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for the experimental complex intrinsics,
+// allowing targets to keep the intrinsics legal until just before codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ExpandComplex.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+namespace {
+
+bool expandComplexInstruction(IntrinsicInst *CI, const TargetLowering *TLI,
+                              const DataLayout &DL) {
+  Intrinsic::ID Opcode = CI->getIntrinsicID();
+  assert((Opcode == Intrinsic::experimental_complex_fmul ||
+          Opcode == Intrinsic::experimental_complex_fdiv) &&
+         "Expected a complex instruction");
+
+  // Break the input values up into real and imaginary pieces.
+  Type *ComplexVectorTy = CI->getArgOperand(0)->getType();
+  Type *FloatTy = ComplexVectorTy->getScalarType();
+  IRBuilder<> Builder(CI);
+  Builder.setFastMathFlags(CI->getFastMathFlags());
+  Value *LhsR = Builder.CreateExtractElement(CI->getArgOperand(0), uint64_t(0));
+  Value *LhsI = Builder.CreateExtractElement(CI->getArgOperand(0), uint64_t(1));
+  Value *RhsR = nullptr, *RhsI = nullptr;
+  RhsR = Builder.CreateExtractElement(CI->getArgOperand(1), uint64_t(0));
+  RhsI = Builder.CreateExtractElement(CI->getArgOperand(1), uint64_t(1));
+
+  // The expansion has three pieces: the naive arithmetic, a possible
+  // prescaling (not relevant for multiplication), and a step to convert NaN
+  // output values to infinity values in certain situations (see Annex G of
+  // the C specification for more details).
+  //
+  // For now, we use the compiler-rt function directly if we need either of
+  // the latter two pieces; otherwise, we do the expansion manually here.
+  Value *OutReal, *OutImag;
+  bool CanExpand = false;
+  // Complex-limited-range explicitly requests only the naive arithmetic step.
+  if (CI->hasFnAttr("complex-limited-range"))
+    CanExpand = true;
+  else {
+    // The NaN check is essentially structured as
+    //   if (isnan(result_real) && isnan(result_imag)) {
+    //     if (isinf(a) || isinf(b)) { /* several statements like this */ }
+    //   }
+    // Therefore, setting one of nnan or ninf alone is sufficient to disable
+    // the recalculation check: nnan by disabling the outer if statement, and
+    // ninf by disabling the inner if statements (making the outer one empty).
+    bool SkipNaNCheck =
+        CI->getFastMathFlags().noNaNs() || CI->getFastMathFlags().noInfs();
+    bool HasScale = Opcode != Intrinsic::experimental_complex_fmul &&
+                    !CI->hasFnAttr("complex-no-scale");
+    CanExpand = SkipNaNCheck && !HasScale;
+  }
+  if (!CanExpand) {
+    // Do a call directly to the compiler-rt library here.
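+    // For reference (a summary of compiler-rt behavior, not code in this
+    // patch): __mulsc3(a, b, c, d) computes (a + b*i) * (c + d*i) for float,
+    // and the h/d/x/t suffixes denote the half/double/x86-fp80/fp128
+    // variants; the __div*c3 family computes (a + b*i) / (c + d*i) with the
+    // full Annex G overflow and NaN handling.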
+    const char *Name = nullptr;
+    if (Opcode == Intrinsic::experimental_complex_fmul) {
+      if (FloatTy->isHalfTy())
+        Name = "__mulhc3";
+      else if (FloatTy->isFloatTy())
+        Name = "__mulsc3";
+      else if (FloatTy->isDoubleTy())
+        Name = "__muldc3";
+      else if (FloatTy->isX86_FP80Ty())
+        Name = "__mulxc3";
+      else if (FloatTy->isFP128Ty() || FloatTy->isPPC_FP128Ty())
+        Name = "__multc3";
+    } else if (Opcode == Intrinsic::experimental_complex_fdiv) {
+      if (FloatTy->isHalfTy())
+        Name = "__divhc3";
+      else if (FloatTy->isFloatTy())
+        Name = "__divsc3";
+      else if (FloatTy->isDoubleTy())
+        Name = "__divdc3";
+      else if (FloatTy->isX86_FP80Ty())
+        Name = "__divxc3";
+      else if (FloatTy->isFP128Ty() || FloatTy->isPPC_FP128Ty())
+        Name = "__divtc3";
+    }
+
+    if (!Name)
+      report_fatal_error("Cannot find libcall for intrinsic");
+
+    // The function we are to call is T complex __name(T, T, T, T) in C terms.
+    // Use TLI to figure out the appropriate actual ABI for this function.
+    StructType *ComplexStructTy = StructType::get(FloatTy, FloatTy);
+    switch (TLI->getComplexReturnABI(FloatTy)) {
+    case TargetLowering::ComplexABI::Vector: {
+      // When the result is a vector type directly, we can replace the
+      // intrinsic with the call to the underlying function without any other
+      // munging.
+      FunctionCallee Func = CI->getModule()->getOrInsertFunction(
+          Name, ComplexVectorTy, FloatTy, FloatTy, FloatTy, FloatTy);
+      Value *NewResult = Builder.CreateCall(Func, {LhsR, LhsI, RhsR, RhsI});
+      CI->replaceAllUsesWith(NewResult);
+      CI->eraseFromParent();
+      return true;
+    }
+    case TargetLowering::ComplexABI::Memory: {
+      // Allocate a struct for the return type in the entry block. Stack slot
+      // coloring should remove duplicate allocations.
+      unsigned AllocaAS = DL.getAllocaAddrSpace();
+      Value *Alloca;
+      {
+        IRBuilderBase::InsertPointGuard Guard(Builder);
+        BasicBlock *EntryBB = &CI->getParent()->getParent()->getEntryBlock();
+        Builder.SetInsertPoint(EntryBB, EntryBB->begin());
+        Alloca = Builder.CreateAlloca(ComplexStructTy, AllocaAS);
+      }
+
+      AttributeList Attrs;
+      AttrBuilder AB(Attrs, 0);
+      AB.addStructRetAttr(ComplexStructTy);
+      Attrs = Attrs.addParamAttributes(CI->getContext(), 0, AB);
+      FunctionCallee Func = CI->getModule()->getOrInsertFunction(
+          Name, Attrs, Type::getVoidTy(CI->getContext()),
+          PointerType::get(ComplexStructTy, AllocaAS), FloatTy, FloatTy,
+          FloatTy, FloatTy);
+
+      Builder.CreateCall(Func, {Alloca, LhsR, LhsI, RhsR, RhsI});
+      OutReal = Builder.CreateLoad(
+          FloatTy, Builder.CreateStructGEP(ComplexStructTy, Alloca, 0));
+      OutImag = Builder.CreateLoad(
+          FloatTy, Builder.CreateStructGEP(ComplexStructTy, Alloca, 1));
+      break;
+    }
+    case TargetLowering::ComplexABI::Struct: {
+      FunctionCallee Func = CI->getModule()->getOrInsertFunction(
+          Name, ComplexStructTy, FloatTy, FloatTy, FloatTy, FloatTy);
+      Value *ComplexStructRes =
+          Builder.CreateCall(Func, {LhsR, LhsI, RhsR, RhsI});
+      OutReal = Builder.CreateExtractValue(ComplexStructRes, 0);
+      OutImag = Builder.CreateExtractValue(ComplexStructRes, 1);
+      break;
+    }
+    }
+  } else {
+    switch (Opcode) {
+    case Intrinsic::experimental_complex_fmul: {
+      // If the target has a complex_fmul expansion, use that instead of
+      // expanding.
+      if (TLI->hasComplexMultiply(FloatTy))
+        return false;
+
+      OutReal = Builder.CreateFSub(Builder.CreateFMul(LhsR, RhsR),
+                                   Builder.CreateFMul(LhsI, RhsI));
+      OutImag = Builder.CreateFAdd(Builder.CreateFMul(LhsI, RhsR),
+                                   Builder.CreateFMul(LhsR, RhsI));
+      break;
+    }
+    case Intrinsic::experimental_complex_fdiv: {
+      Value *Scale = Builder.CreateFAdd(Builder.CreateFMul(RhsR, RhsR),
+                                        Builder.CreateFMul(RhsI, RhsI));
+      OutReal =
+          Builder.CreateFDiv(Builder.CreateFAdd(Builder.CreateFMul(LhsR, RhsR),
+                                                Builder.CreateFMul(LhsI, RhsI)),
+                             Scale);
+      OutImag =
+          Builder.CreateFDiv(Builder.CreateFSub(Builder.CreateFMul(LhsI, RhsR),
+                                                Builder.CreateFMul(LhsR, RhsI)),
+                             Scale);
+      break;
+    }
+    }
+  }
+
+  // Replace all of the uses of the intrinsic with OutReal/OutImag. We avoid
+  // creating the vector unless we have to.
+  bool HasVectorUse = false;
+  for (User *U : CI->users()) {
+    uint64_t Index;
+    if (match(U, m_ExtractElt(m_Value(), m_ConstantInt(Index)))) {
+      assert((Index == 0 || Index == 1) && "Unexpected extract element index");
+      U->replaceAllUsesWith(Index == 0 ? OutReal : OutImag);
+    } else {
+      HasVectorUse = true;
+    }
+  }
+
+  if (HasVectorUse) {
+    Value *OutComplex = Builder.CreateInsertElement(
+        Builder.CreateInsertElement(UndefValue::get(ComplexVectorTy), OutReal,
+                                    uint64_t(0)),
+        OutImag, uint64_t(1));
+    CI->replaceAllUsesWith(OutComplex);
+  } else {
+    CI->replaceAllUsesWith(UndefValue::get(CI->getType()));
+  }
+
+  CI->eraseFromParent();
+  return true;
+}
+
+bool expandComplexIntrinsics(Function &F, const TargetLowering *TLI) {
+  bool Changed = false;
+  SmallVector<IntrinsicInst *, 4> Worklist;
+  for (auto &I : instructions(F)) {
+    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+      switch (II->getIntrinsicID()) {
+      default:
+        break;
+      case Intrinsic::experimental_complex_fmul:
+      case Intrinsic::experimental_complex_fdiv:
+        Worklist.push_back(II);
+        break;
+      }
+    }
+  }
+
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  for (auto *II : Worklist) {
+    Changed |= expandComplexInstruction(II, TLI, DL);
+  }
+  return Changed;
+}
+
+class ExpandComplex : public FunctionPass {
+public:
+  static char ID;
+  ExpandComplex() : FunctionPass(ID) {
+    initializeExpandComplexPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    const TargetMachine *TM =
+        &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+    const TargetSubtargetInfo *SubtargetInfo = TM->getSubtargetImpl(F);
+    const TargetLowering *TLI = SubtargetInfo->getTargetLowering();
+    return expandComplexIntrinsics(F, TLI);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.setPreservesCFG();
+  }
+};
+} // namespace
+
+char ExpandComplex::ID;
+INITIALIZE_PASS_BEGIN(ExpandComplex, "expand-complex",
+                      "Expand complex intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(ExpandComplex, "expand-complex",
+                    "Expand complex intrinsics", false, false)
+
+FunctionPass *llvm::createExpandComplexPass() { return new ExpandComplex(); }
+
+PreservedAnalyses ExpandComplexPass::run(Function &F,
+                                         FunctionAnalysisManager &AM) {
+  /*const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  if (!expandReductions(F, &TTI))
+    return PreservedAnalyses::all();*/
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -922,6 +922,10 @@
   // Allow disabling it for testing purposes.
   if (!DisableExpandReductions)
     addPass(createExpandReductionsPass());
+
+  // If the target doesn't support the complex intrinsics natively, or if they
+  // need to be expanded into libcalls, generate that expansion here.
+  addPass(createExpandComplexPass());
 }
 
 /// Turn exception handling constructs into something the code generators can
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -982,6 +982,33 @@
   return C;
 }
 
+Value *IRBuilderBase::CreateComplexMul(Value *L, Value *R, bool CxLimitedRange,
+                                       const Twine &Name) {
+  CallInst *Result = CreateBinaryIntrinsic(
+      Intrinsic::experimental_complex_fmul, L, R, nullptr, Name);
+  Result->setFastMathFlags(FMF);
+  AttributeList Attrs = Result->getAttributes();
+  if (CxLimitedRange)
+    Attrs = Attrs.addFnAttribute(getContext(), "complex-limited-range");
+  Result->setAttributes(Attrs);
+  return Result;
+}
+
+Value *IRBuilderBase::CreateComplexDiv(Value *L, Value *R, bool CxLimitedRange,
+                                       bool CxNoScale, const Twine &Name) {
+  CallInst *Result = CreateBinaryIntrinsic(
+      Intrinsic::experimental_complex_fdiv, L, R, nullptr, Name);
+  Result->setFastMathFlags(FMF);
+  AttributeList Attrs = Result->getAttributes();
+  if (CxLimitedRange)
+    Attrs = Attrs.addFnAttribute(getContext(), "complex-limited-range");
+  // complex-limited-range implies complex-no-scale.
+  if (CxNoScale || CxLimitedRange)
+    Attrs = Attrs.addFnAttribute(getContext(), "complex-no-scale");
+  Result->setAttributes(Attrs);
+  return Result;
+}
+
 Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False,
                                    const Twine &Name, Instruction *MDFrom) {
   if (auto *CC = dyn_cast<Constant>(C))
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5395,6 +5395,18 @@
            &Call);
     break;
   }
+  case Intrinsic::experimental_complex_fdiv:
+  case Intrinsic::experimental_complex_fmul: {
+    // Check that the vector type is a pair of floating-point types.
+    Type *ArgTy = Call.getArgOperand(0)->getType();
+    FixedVectorType *VectorTy = dyn_cast<FixedVectorType>(ArgTy);
+    Assert(VectorTy && VectorTy->getNumElements() % 2 == 0 &&
+               VectorTy->getElementType()->isFloatingPointTy(),
+           "complex intrinsic must use an even-length vector of "
+           "floating-point types",
+           &Call);
+    break;
+  }
  };
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -979,6 +979,8 @@
     /// legal as the hook is used before type legalization.
     bool isSafeMemOpType(MVT VT) const override;
 
+    ComplexABI getComplexReturnABI(Type *ScalarFloatTy) const override;
+
     /// Returns true if the target allows unaligned memory accesses of the
     /// specified type. Returns whether it is "fast" in the last argument.
     bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2563,6 +2563,32 @@
   return MVT::i32;
 }
 
+TargetLoweringBase::ComplexABI
+X86TargetLowering::getComplexReturnABI(Type *ScalarFloatTy) const {
+  if (Subtarget.is32Bit()) {
+    if (ScalarFloatTy->isFloatTy()) {
+      report_fatal_error("Cannot compile complex return ABI for i386 ABI");
+    } else if (ScalarFloatTy->isHalfTy()) {
+      return ComplexABI::Vector;
+    } else {
+      return ComplexABI::Memory;
+    }
+  } else {
+    // The x86-64 ABI specifies that (save for x86-fp80) a complex value is
+    // handled as a regular C struct. This means that float and smaller get
+    // packed into a single vector in xmm0; double and x86-fp80 (by special
+    // case) return two values; and types larger than x86-fp80 (i.e., fp128)
+    // return via memory.
+    unsigned FloatSize = ScalarFloatTy->getPrimitiveSizeInBits().getFixedSize();
+    if (FloatSize <= 32) {
+      return ComplexABI::Vector;
+    } else if (FloatSize <= 80) {
+      return ComplexABI::Struct;
+    } else {
+      return ComplexABI::Memory;
+    }
+  }
+}
+
 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   if (VT == MVT::f32)
     return X86ScalarSSEf32;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -737,6 +737,10 @@
   ///
   /// If the multiplication is known not to overflow then NoSignedWrap is set.
   Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
+
+  /// Try to match a complex intrinsic that produces the given real/imaginary
+  /// pair. Returns whether or not it was successful.
+  bool createComplexMathInstruction(Value *Real, Value *Imag);
 };
 
 class Negator final {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1413,6 +1413,33 @@
       eraseInstFromFunction(*PrevSI);
       return nullptr;
     }
+
+    // Is this potentially a complex instruction?
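+    // The shape being matched below: two unordered stores into the same
+    // aggregate through GEPs that agree on every index except the last,
+    // which is 0 for the earlier store (the real part) and 1 for this one
+    // (the imaginary part). Together they store one complex value.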
+    auto *OurGEP = dyn_cast<GetElementPtrInst>(Ptr);
+    auto *TheirGEP = dyn_cast<GetElementPtrInst>(PrevSI->getOperand(1));
+    if (PrevSI->isUnordered() && OurGEP && TheirGEP &&
+        OurGEP->getOperand(0) == TheirGEP->getOperand(0) &&
+        OurGEP->getNumIndices() == TheirGEP->getNumIndices() &&
+        OurGEP->getType() == TheirGEP->getType()) {
+      bool AllMatch = true;
+      unsigned LastIndex = OurGEP->getNumIndices();
+      for (unsigned Index = 1; Index < LastIndex; Index++) {
+        if (OurGEP->getOperand(Index) != TheirGEP->getOperand(Index)) {
+          AllMatch = false;
+          break;
+        }
+      }
+      if (!AllMatch)
+        break;
+      if (match(OurGEP->getOperand(LastIndex), m_ConstantInt<1>()) &&
+          match(TheirGEP->getOperand(LastIndex), m_ConstantInt<0>())) {
+        IRBuilderBase::InsertPointGuard Guard(Builder);
+        Builder.SetInsertPoint(PrevSI);
+        if (createComplexMathInstruction(PrevSI->getOperand(0), Val))
+          return &SI;
+      }
+    }
+
     break;
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1109,6 +1109,21 @@
   if (Instruction *NewI = foldAggregateConstructionIntoAggregateReuse(I))
     return NewI;
 
+  // Check if this is potentially a complex instruction that has been manually
+  // expanded.
+  ArrayRef<Type *> Fields = I.getType()->subtypes();
+  if (Fields.size() == 2 && Fields[0] == Fields[1] &&
+      Fields[0]->isFloatingPointTy()) {
+    Value *RealV, *ImgV;
+    if (match(&I, m_InsertValue<1>(m_InsertValue<0>(m_Value(), m_Value(RealV)),
+                                   m_Value(ImgV)))) {
+      IRBuilderBase::InsertPointGuard Guard(Builder);
+      Builder.SetInsertPoint(cast<Instruction>(I.getOperand(0)));
+      if (createComplexMathInstruction(RealV, ImgV))
+        return &I;
+    }
+  }
+
   return nullptr;
 }
@@ -1589,6 +1604,17 @@
   if (Instruction *Ext = narrowInsElt(IE, Builder))
     return Ext;
 
+  // Check for a potential computation of a complex instruction.
+  ElementCount Count = IE.getType()->getElementCount();
+  Value *RealV, *ImagV;
+  if (!Count.isScalable() && Count.getFixedValue() == 2 &&
+      match(&IE, m_InsertElt(
+                     m_InsertElt(m_Value(), m_Value(RealV), m_ConstantInt<0>()),
+                     m_Value(ImagV), m_ConstantInt<1>()))) {
+    if (createComplexMathInstruction(RealV, ImagV))
+      return &IE;
+  }
+
   return nullptr;
 }
@@ -2793,3 +2819,120 @@
   return MadeChange ? &SVI : nullptr;
 }
+
+static cl::opt<bool> InstCombineComplex(
+    "inst-combine-complex",
+    cl::desc("Enable pattern match to llvm.experimental.complex.* intrinsics"));
+
+bool InstCombinerImpl::createComplexMathInstruction(Value *Real, Value *Imag) {
+  if (!InstCombineComplex)
+    return false;
+
+  Instruction *RealI = dyn_cast<Instruction>(Real);
+  Instruction *ImagI = dyn_cast<Instruction>(Imag);
+  if (!RealI || !ImagI)
+    return false;
+
+  // Don't try to handle vector instructions for now.
+  if (RealI->getType()->isVectorTy())
+    return false;
+
+  Value *Op0R, *Op0I, *Op1R, *Op1I, *Scale, *Numerator;
+  // Compute the intersection of all the fast math flags of the entire tree up
+  // to the point that the input complex numbers are specified.
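+  // A fast-math flag may be kept on the fused intrinsic only if every
+  // instruction being replaced carries it, so start from the full flag set
+  // and intersect (&=) while walking the tree.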
+  auto computeFMF = [&]() {
+    SmallVector<Instruction *, 8> Worklist = {RealI, ImagI};
+    FastMathFlags Flags;
+    Flags.set();
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.back();
+      Worklist.pop_back();
+      Flags &= I->getFastMathFlags();
+      for (Use &U : I->operands()) {
+        Value *V = U.get();
+        if (V == Op0R || V == Op0I || V == Op1R || V == Op1I)
+          continue;
+        Worklist.push_back(cast<Instruction>(V));
+      }
+    }
+    return Flags;
+  };
+
+  Intrinsic::ID NewIntrinsic = Intrinsic::not_intrinsic;
+  // Check for complex multiply:
+  //   real = op0.real * op1.real - op0.imag * op1.imag
+  //   imag = op0.real * op1.imag + op0.imag * op1.real
+  if (match(Real, m_FSub(m_OneUse(m_FMul(m_Value(Op0R), m_Value(Op1R))),
+                         m_OneUse(m_FMul(m_Value(Op0I), m_Value(Op1I)))))) {
+    if (match(
+            Imag,
+            m_c_FAdd(m_OneUse(m_c_FMul(m_Specific(Op0R), m_Specific(Op1I))),
+                     m_OneUse(m_c_FMul(m_Specific(Op1R), m_Specific(Op0I)))))) {
+      NewIntrinsic = Intrinsic::experimental_complex_fmul;
+    }
+  }
+  // Check for complex div:
+  //   real = (op0.real * op1.real + op0.imag * op1.imag) / scale
+  //   imag = (op0.imag * op1.real - op0.real * op1.imag) / scale
+  // where scale = op1.real * op1.real + op1.imag * op1.imag
+  else if (match(Imag, m_FDiv(m_Value(Numerator), m_Value(Scale)))) {
+    if (match(Scale,
+              m_FAdd(m_OneUse(m_FMul(m_Value(Op1R), m_Deferred(Op1R))),
+                     m_OneUse(m_FMul(m_Value(Op1I), m_Deferred(Op1I)))))) {
+      // The matching of Op1R and Op1I is tentative; we may need to swap the
+      // assignments.
+      auto checkNumerator = [&]() {
+        return match(Numerator,
+                     m_OneUse(m_FSub(
+                         m_OneUse(m_c_FMul(m_Value(Op0I), m_Specific(Op1R))),
+                         m_OneUse(m_c_FMul(m_Value(Op0R), m_Specific(Op1I))))));
+      };
+      bool ImagMatches = checkNumerator();
+      if (!ImagMatches) {
+        std::swap(Op1R, Op1I);
+        ImagMatches = checkNumerator();
+      }
+      if (ImagMatches &&
+          match(Real,
+                m_FDiv(m_OneUse(m_c_FAdd(m_OneUse(m_c_FMul(m_Specific(Op0R),
+                                                           m_Specific(Op1R))),
+                                         m_OneUse(m_c_FMul(m_Specific(Op0I),
+                                                           m_Specific(Op1I))))),
+                       m_Specific(Scale)))) {
+        NewIntrinsic = Intrinsic::experimental_complex_fdiv;
+      }
+    }
+  }
+
+  // Make sure we matched an intrinsic.
+  if (NewIntrinsic == Intrinsic::not_intrinsic)
+    return false;
+
+  // Use the computation tree to capture all of the fast-math flags.
+  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
+  Builder.setFastMathFlags(computeFMF());
+
+  Value *Op0 = Builder.CreateComplexValue(Op0R, Op0I);
+  Value *Op1 = Builder.CreateComplexValue(Op1R, Op1I);
+
+  // Create the new intrinsic. Since our pattern matched only the direct
+  // arithmetic formulas, we have to create it with the complex-limited-range
+  // attribute.
+  Value *Result;
+  switch (NewIntrinsic) {
+  case Intrinsic::experimental_complex_fmul:
+    Result = Builder.CreateComplexMul(Op0, Op1, true);
+    break;
+  case Intrinsic::experimental_complex_fdiv:
+    Result = Builder.CreateComplexDiv(Op0, Op1, true);
+    break;
+  default:
+    llvm_unreachable("Unexpected complex intrinsic");
+  }
+
+  replaceInstUsesWith(*RealI,
+                      Builder.CreateExtractElement(Result, uint64_t(0)));
+  replaceInstUsesWith(*ImagI,
+                      Builder.CreateExtractElement(Result, uint64_t(1)));
+
+  return true;
+}
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -29,6 +29,7 @@
 ; CHECK-NEXT:       Expand vector predication intrinsics
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
+; CHECK-NEXT:       Expand complex intrinsics
 ; CHECK-NEXT:       Expand indirectbr instructions
 ; CHECK-NEXT:       Exception handling preparation
 ; CHECK-NEXT:       Safe Stack instrumentation pass
diff --git a/llvm/test/CodeGen/X86/complex-32bit.ll b/llvm/test/CodeGen/X86/complex-32bit.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/complex-32bit.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s
+
+; Check that we handle the ABI of the complex functions correctly for 32-bit.
+
+declare <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half>, <2 x half>)
+declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>)
+declare <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80>, <2 x x86_fp80>)
+declare <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128>, <2 x fp128>)
+
+define <2 x half> @intrinsic_f16(<2 x half> %z, <2 x half> %w) {
+; CHECK-LABEL: intrinsic_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    subl $8, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %esi, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    calll __mulhc3@PLT
+; CHECK-NEXT:    addl $24, %esp
+; CHECK-NEXT:    .cfi_adjust_cfa_offset -24
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
+  %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w)
+  ret <2 x half> %mul
+}
+
+; Skip intrinsic_f32 -- we don't support complex float on 32-bit for now.
+
+define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) {
+; CHECK-LABEL: intrinsic_f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $60, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, (%esp)
+; CHECK-NEXT:    calll __muldc3@PLT
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:    addl $60, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
+  %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w)
+  ret <2 x double> %mul
+}
+
+define <2 x x86_fp80> @intrinsic_f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) {
+; CHECK-LABEL: intrinsic_f80:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $92, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, (%esp)
+; CHECK-NEXT:    calll __mulxc3@PLT
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:    addl $92, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
+  %mul = call <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w)
+  ret <2 x x86_fp80> %mul
+}
+
+define <2 x fp128> @intrinsic_f128(<2 x fp128> %z, <2 x fp128> %w) {
+; CHECK-LABEL: intrinsic_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 20
+; CHECK-NEXT:    subl $60, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset %esi, -20
+; CHECK-NEXT:    .cfi_offset %edi, -16
+; CHECK-NEXT:    .cfi_offset %ebx, -12
+; CHECK-NEXT:    .cfi_offset %ebp, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 12
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    calll __multc3@PLT
+; CHECK-NEXT:    addl $80, %esp
+; CHECK-NEXT:    .cfi_adjust_cfa_offset -80
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ecx, 28(%esi)
+; CHECK-NEXT:    movl %eax, 24(%esi)
+; CHECK-NEXT:    movl %ebp, 20(%esi)
+; CHECK-NEXT:    movl %ebx, 16(%esi)
+; CHECK-NEXT:    movl %edi, 12(%esi)
+; CHECK-NEXT:    movl %edx, 8(%esi)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, 4(%esi)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, (%esi)
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    addl $60, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 20
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl $4
+  %mul = call <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128> %z, <2 x fp128> %w)
+  ret <2 x fp128> %mul
+}
+
diff --git a/llvm/test/CodeGen/X86/complex-64bit.ll b/llvm/test/CodeGen/X86/complex-64bit.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/complex-64bit.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Check that we handle the ABI of the complex functions correctly for 64-bit.
+
+declare <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half>, <2 x half>)
+declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>)
+declare <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80>, <2 x x86_fp80>)
+declare <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128>, <2 x fp128>)
+
+define <2 x half> @intrinsic_f16(<2 x half> %z, <2 x half> %w) {
+; CHECK-LABEL: intrinsic_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __mulhc3@PLT
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w)
+  ret <2 x half> %mul
+}
+
+define <2 x float> @intrinsic_f32(<2 x float> %z, <2 x float> %w) {
+; CHECK-LABEL: intrinsic_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movaps %xmm1, %xmm2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; CHECK-NEXT:    movaps %xmm2, %xmm3
+; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
+; CHECK-NEXT:    callq __mulsc3@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %mul
+}
+
+define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) {
+; CHECK-LABEL: intrinsic_f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movaps %xmm1, %xmm2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    movaps %xmm2, %xmm3
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; CHECK-NEXT:    callq __muldc3@PLT
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w)
+  ret <2 x double> %mul
+}
+
+define <2 x x86_fp80> @intrinsic_f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) {
+; CHECK-LABEL: intrinsic_f80:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fstpt (%rsp)
+; CHECK-NEXT:    callq __mulxc3@PLT
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %mul = call <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w)
+  ret <2 x x86_fp80> %mul
+}
+
+define <2 x fp128> @intrinsic_f128(<2 x fp128> %z, <2 x fp128> %w) {
+; CHECK-LABEL: intrinsic_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movq %rsp, %rdi
+; CHECK-NEXT:    callq __multc3@PLT
+; CHECK-NEXT:    movaps (%rsp), %xmm0
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %mul = call <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128> %z, <2 x fp128> %w)
+  ret <2 x fp128> %mul
+}
+
diff --git a/llvm/test/CodeGen/X86/complex-divide.ll b/llvm/test/CodeGen/X86/complex-divide.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/complex-divide.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Check the expansion of the complex divide intrinsic. This only tests
+; expansion for 32-bit floats, as the expansion should produce identical IR
+; expansions save for the ABI of calling __divsc3, which is tested (indirectly)
+; for each type individually in complex-{32,64}bit.ll.
+
+declare <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float>, <2 x float>)
+
+; Generate a call to __divsc3
+define <2 x float> @intrinsic_slow_f32(<2 x float> %z, <2 x float> %w) {
+; CHECK-LABEL: intrinsic_slow_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movaps %xmm1, %xmm2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; CHECK-NEXT:    movaps %xmm2, %xmm3
+; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
+; CHECK-NEXT:    callq __divsc3@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %div = call <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %div
+}
+
+; Do not do an expansion (because fast is not sufficient to imply full
+; complex-limited-range).
+define <2 x float> @intrinsic_implied_not_limited_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_implied_not_limited_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    vmovaps %xmm1, %xmm2
+; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-NEXT:    callq __divsc3@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %div = call fast <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %div
+}
+
+; Do an expansion (because nnan/ninf + "complex-no-scale"). No arcp or fma
+; should be used.
+define <2 x float> @intrinsic_implied_limited_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_implied_limited_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT:    vaddss {{.*}} %xmm4
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT:    vaddss {{.*}} %xmm5
+; CHECK-NEXT:    vdivss %xmm4, %xmm5, %xmm5
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vdivss %xmm4, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[2,3]
+; CHECK-NEXT:    retq
+  %div = call nnan ninf <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #2
+  ret <2 x float> %div
+}
+
+; Do an expansion (because of complex-limited-range).
+define <2 x float> @intrinsic_limited_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_limited_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vaddss {{.*}} %xmm4
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vaddss {{.*}} %xmm5
+; CHECK-NEXT: vdivss %xmm4, %xmm5, %xmm5
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vdivss %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[2,3]
+; CHECK-NEXT: retq
+  %div = call <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #0
+  ret <2 x float> %div
+}
+
+; Do an expansion, and use FMA (because of fast-math flags).
+define <2 x float> @intrinsic_fast_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_fast_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-NEXT: vmulss %xmm3, %xmm3, %xmm4
+; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm1 * xmm1) + xmm4
+; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm5
+; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm5 = (xmm0 * xmm1) + xmm5
+; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; CHECK-NEXT: vdivss %xmm4, %xmm6, %xmm4
+; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm5
+; CHECK-NEXT: vmulss %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm2 * xmm1) - xmm0
+; CHECK-NEXT: vmulss %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[2,3]
+; CHECK-NEXT: retq
+  %div = call fast <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #0
+  ret <2 x float> %div
+}
+
+attributes #0 = { "complex-limited-range"="true" }
+attributes #1 = { "target-features"="+fma" }
+attributes #2 = { "complex-no-scale"="true" }
diff --git a/llvm/test/CodeGen/X86/complex-multiply.ll b/llvm/test/CodeGen/X86/complex-multiply.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/complex-multiply.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Check the expansion of the complex multiply intrinsic. This only tests the
+; expansion for 32-bit floats, since every type produces an identical IR
+; expansion save for the ABI of the __mulsc3 call, which is tested for each
+; type individually in intel-complex-{32,64}bit.ll.
+
+declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>)
+
+; Generate a call to __mulsc3.
+define <2 x float> @intrinsic_slow_f32(<2 x float> %z, <2 x float> %w) {
+; CHECK-LABEL: intrinsic_slow_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movaps %xmm1, %xmm2
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; CHECK-NEXT: movaps %xmm2, %xmm3
+; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
+; CHECK-NEXT: callq __mulsc3@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+  %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %mul
+}
+
+; Do an expansion (because nnan/ninf imply limited range for multiplication).
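+; For reference, with z = a+bi and w = c+di, the expansion checked below is
+; the textbook product (a sketch; the NaN/infinity recovery performed by
+; __mulsc3 is deliberately absent):
+;   real = a*c - b*d
+;   imag = a*d + b*c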
+define <2 x float> @intrinsic_implied_limited_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_implied_limited_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vsubss {{.*}} %xmm4
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vaddss {{.*}} %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[2,3]
+; CHECK-NEXT: retq
+  %mul = call nnan ninf <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %mul
+}
+
+; Do an expansion (because of complex-limited-range).
+define <2 x float> @intrinsic_limited_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_limited_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vsubss {{.*}} %xmm4
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vaddss {{.*}} %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[2,3]
+; CHECK-NEXT: retq
+  %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) #0
+  ret <2 x float> %mul
+}
+
+; Do an expansion, and use FMA (because of fast-math flags).
+define <2 x float> @intrinsic_fast_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_fast_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
+; CHECK-NEXT: vmulss %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm2 * xmm1) + xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[2,3]
+; CHECK-NEXT: retq
+  %mul = call fast <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %mul
+}
+
+attributes #0 = { "complex-limited-range"="true" }
+attributes #1 = { "target-features"="+fma" }
diff --git a/llvm/test/Transforms/InstCombine/complex-math.ll b/llvm/test/Transforms/InstCombine/complex-math.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/complex-math.ll
@@ -0,0 +1,279 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals
+; RUN: opt < %s -instcombine -S -inst-combine-complex | FileCheck %s
+
+; Check that we match the simple expansions of complex multiplication and
+; division, whether the resulting complex value is returned as a struct,
+; returned as a vector, or stored to memory.
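+; As a rough guide (operand order may vary), the scalar multiply pattern
+; matched below is
+;   %x = fsub (fmul %a, %c), (fmul %b, %d)   ; real part
+;   %y = fadd (fmul %a, %d), (fmul %b, %c)   ; imaginary part
+; and the divide pattern additionally divides (a*c + b*d) and (b*c - a*d) by
+; the scale (c*c + d*d). Each match is rewritten into a single complex
+; intrinsic call tagged "complex-limited-range" (plus "complex-no-scale" for
+; divides), since the matched IR performs no range reduction of its own.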
+
+%complex.double = type {double, double}
+
+define %complex.double @struct_mul(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @struct_mul(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %x = fsub double %ac, %bd
+  %y = fadd double %ad, %bc
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+define <2 x double> @vector_mul(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @vector_mul(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]]
+; CHECK-NEXT: ret <2 x double> [[TMP5]]
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %x = fsub double %ac, %bd
+  %y = fadd double %ad, %bc
+  %res = insertelement <2 x double> zeroinitializer, double %x, i32 0
+  %res.1 = insertelement <2 x double> %res, double %y, i32 1
+  ret <2 x double> %res.1
+}
+
+define void @memory_mul(double %a, double %b, double %c, double %d, %complex.double* %dest) {
+; CHECK-LABEL: @memory_mul(
+; CHECK-NEXT: [[DEST_REAL:%.*]] = getelementptr [[COMPLEX_DOUBLE:%.*]], %complex.double* [[DEST:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[DEST_IMAG:%.*]] = getelementptr [[COMPLEX_DOUBLE]], %complex.double* [[DEST]], i64 0, i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: store double [[TMP6]], double* [[DEST_REAL]], align 8
+; CHECK-NEXT: store double [[TMP7]], double* [[DEST_IMAG]], align 8
+; CHECK-NEXT: ret void
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %x = fsub double %ac, %bd
+  %y = fadd double %ad, %bc
+  %dest.real = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 0
+  %dest.imag = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 1
+  store double %x, double* %dest.real
+  store double %y, double* %dest.imag
+  ret void
+}
+
+define %complex.double @fast_mul(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @fast_mul(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul fast double %a, %c
+  %bd = fmul fast double %b, %d
+  %ad = fmul fast double %a, %d
+  %bc = fmul fast double %b, %c
+  %x = fsub fast double %ac, %bd
+  %y = fadd fast double %ad, %bc
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+; Only fast-math flags common to every instruction in the pattern (here only
+; ninf) should be propagated to the intrinsic call.
+define %complex.double @fastish_mul(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @fastish_mul(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call ninf <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul fast double %a, %c
+  %bd = fmul nnan ninf nsz double %b, %d
+  %ad = fmul ninf arcp contract double %a, %d
+  %bc = fmul reassoc nsz ninf double %b, %c
+  %x = fsub ninf arcp afn double %ac, %bd
+  %y = fadd afn nnan ninf double %ad, %bc
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+define %complex.double @struct_div(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @struct_div(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %cc = fmul double %c, %c
+  %dd = fmul double %d, %d
+  %scale = fadd double %cc, %dd
+  %x_noscale = fadd double %ac, %bd
+  %y_noscale = fsub double %bc, %ad
+  %x = fdiv double %x_noscale, %scale
+  %y = fdiv double %y_noscale, %scale
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+define <2 x double> @vector_div(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @vector_div(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR2]]
+; CHECK-NEXT: ret <2 x double> [[TMP5]]
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %cc = fmul double %c, %c
+  %dd = fmul double %d, %d
+  %scale = fadd double %cc, %dd
+  %x_noscale = fadd double %ac, %bd
+  %y_noscale = fsub double %bc, %ad
+  %x = fdiv double %x_noscale, %scale
+  %y = fdiv double %y_noscale, %scale
+  %res = insertelement <2 x double> zeroinitializer, double %x, i32 0
+  %res.1 = insertelement <2 x double> %res, double %y, i32 1
+  ret <2 x double> %res.1
+}
+
+define void @memory_div(double %a, double %b, double %c, double %d, %complex.double* %dest) {
+; CHECK-LABEL: @memory_div(
+; CHECK-NEXT: [[DEST_REAL:%.*]] = getelementptr [[COMPLEX_DOUBLE:%.*]], %complex.double* [[DEST:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[DEST_IMAG:%.*]] = getelementptr [[COMPLEX_DOUBLE]], %complex.double* [[DEST]], i64 0, i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: store double [[TMP6]], double* [[DEST_REAL]], align 8
+; CHECK-NEXT: store double [[TMP7]], double* [[DEST_IMAG]], align 8
+; CHECK-NEXT: ret void
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %cc = fmul double %c, %c
+  %dd = fmul double %d, %d
+  %scale = fadd double %cc, %dd
+  %x_noscale = fadd double %ac, %bd
+  %y_noscale = fsub double %bc, %ad
+  %x = fdiv double %x_noscale, %scale
+  %y = fdiv double %y_noscale, %scale
+  %dest.real = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 0
+  %dest.imag = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 1
+  store double %x, double* %dest.real
+  store double %y, double* %dest.imag
+  ret void
+}
+
+define %complex.double @fast_div(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @fast_div(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul fast double %a, %c
+  %bd = fmul fast double %b, %d
+  %ad = fmul fast double %a, %d
+  %bc = fmul fast double %b, %c
+  %cc = fmul fast double %c, %c
+  %dd = fmul fast double %d, %d
+  %scale = fadd fast double %cc, %dd
+  %x_noscale = fadd fast double %ac, %bd
+  %y_noscale = fsub fast double %bc, %ad
+  %x = fdiv fast double %x_noscale, %scale
+  %y = fdiv fast double %y_noscale, %scale
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+; Only fast-math flags common to every instruction in the pattern (here only
+; arcp) should be propagated to the intrinsic call.
+define %complex.double @fastish_div(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @fastish_div(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call arcp <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul arcp contract double %a, %c
+  %bd = fmul arcp afn ninf reassoc double %b, %d
+  %ad = fmul arcp afn ninf double %a, %d
+  %bc = fmul arcp nsz reassoc double %b, %c
+  %cc = fmul arcp nsz afn double %c, %c
+  %dd = fmul arcp nsz double %d, %d
+  %scale = fadd arcp nsz contract nnan reassoc double %cc, %dd
+  %x_noscale = fadd arcp nsz contract ninf nnan double %ac, %bd
+  %y_noscale = fsub arcp nsz contract reassoc double %bc, %ad
+  %x = fdiv arcp ninf nnan reassoc double %x_noscale, %scale
+  %y = fdiv arcp nnan double %y_noscale, %scale
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nofree nosync nounwind readnone willreturn }
+; CHECK: attributes #[[ATTR1]] = { "complex-limited-range" }
+; CHECK: attributes #[[ATTR2]] = { "complex-limited-range" "complex-no-scale" }
+;.
diff --git a/llvm/test/Verifier/complex-intrinsics.ll b/llvm/test/Verifier/complex-intrinsics.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Verifier/complex-intrinsics.ll
@@ -0,0 +1,39 @@
+; RUN: opt -verify -S < %s 2>&1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: opt -verify -S < %s 2>&1 | FileCheck --check-prefix=CHECK2 %s
+; RUN: sed -e s/.T3:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK3 %s
+; RUN: sed -e s/.T4:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK4 %s
+
+; Check that a double-valued complex fmul is accepted, and that its
+; attributes are correct.
+; CHECK1: declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) #[[ATTR:[0-9]+]]
+; CHECK1: attributes #[[ATTR]] = { nofree nosync nounwind readnone willreturn }
+declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>)
+define <2 x double> @t1(<2 x double> %a, <2 x double> %b) {
+  %res = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %a, <2 x double> %b)
+  ret <2 x double> %res
+}
+
+; Check that vector-of-complex values are supported.
+; CHECK2: declare <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double>, <4 x double>) #[[ATTR:[0-9]+]]
+; CHECK2: attributes #[[ATTR]] = { nofree nosync nounwind readnone willreturn }
declare <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double>, <4 x double>)
+define <4 x double> @t2(<4 x double> %a, <4 x double> %b) {
+  %res = call <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double> %a, <4 x double> %b)
+  ret <4 x double> %res
+}
+
+; Check that odd-length vectors are rejected.
+; CHECK3: complex intrinsic must use an even-length vector of floating-point types
+;T3: declare <3 x double> @llvm.experimental.complex.fmul.v3f64(<3 x double>, <3 x double>)
+;T3: define <3 x double> @t3(<3 x double> %a, <3 x double> %b) {
+;T3:   %res = call <3 x double> @llvm.experimental.complex.fmul.v3f64(<3 x double> %a, <3 x double> %b)
+;T3:   ret <3 x double> %res
+;T3: }
+
+; Check that non-floating-point complex types are rejected.
+; CHECK4: complex intrinsic must use an even-length vector of floating-point types
+;T4: declare <2 x i64> @llvm.experimental.complex.fmul.v2i64(<2 x i64>, <2 x i64>)
+;T4: define <2 x i64> @t4(<2 x i64> %a, <2 x i64> %b) {
+;T4:   %res = call <2 x i64> @llvm.experimental.complex.fmul.v2i64(<2 x i64> %a, <2 x i64> %b)
+;T4:   ret <2 x i64> %res
+;T4: }
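+
+; Note: as in the expansion tests above, element 0 of a <2 x double> complex
+; value holds the real part and element 1 the imaginary part; for longer
+; even-length vectors such as <4 x double>, each consecutive pair of elements
+; is presumably one complex value.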