diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -403,6 +403,7 @@
     bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
     bool optimizeSwitchInst(SwitchInst *SI);
     bool optimizeExtractElementInst(Instruction *Inst);
+    bool optimizeInsertValueInst(InsertValueInst *Inst);
     bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT);
     bool fixupDbgValue(Instruction *I);
     bool placeDbgValues(Function &F);
@@ -7417,6 +7418,48 @@
   return false;
 }
 
+/// Replace the scalar expansion of a complex floating-point multiply that
+/// feeds a {real, imag} insertvalue chain with a call to the
+/// llvm.complex.multiply intrinsic.
+bool CodeGenPrepare::optimizeInsertValueInst(InsertValueInst *Inst) {
+  // Match a chain inserting the real part at index 0 and the imaginary part
+  // at index 1 of the result aggregate.
+  Value *RealV, *ImgV;
+  if (!match(Inst, m_InsertValue<1>(m_InsertValue<0>(m_Value(), m_Value(RealV)),
+                                    m_Value(ImgV))))
+    return false;
+
+  // Real part: Op0R * Op1R - Op0I * Op1I.
+  Value *Op0R, *Op0I, *Op1R, *Op1I;
+  if (!match(RealV, m_FSub(m_FMul(m_Value(Op0R), m_Value(Op1R)),
+                           m_FMul(m_Value(Op0I), m_Value(Op1I)))))
+    return false;
+
+  // Imaginary part: Op0R * Op1I + Op1R * Op0I.
+  if (!match(ImgV, m_c_FAdd(m_c_FMul(m_Specific(Op0R), m_Specific(Op1I)),
+                            m_c_FMul(m_Specific(Op1R), m_Specific(Op0I)))))
+    return false;
+
+  // Pack both operands into 2-element vectors, emit the intrinsic and feed
+  // the extracted results back into the insertvalue chain.
+  auto *VecTy = FixedVectorType::get(Op0R->getType(), 2);
+  IRBuilder<> Builder(Inst->getContext());
+  Builder.SetInsertPoint(cast<Instruction>(Inst->getOperand(0)));
+  Value *Vec0Op0 =
+      Builder.CreateInsertElement(UndefValue::get(VecTy), Op0R, 0ull);
+  Value *Vec0Op1 = Builder.CreateInsertElement(Vec0Op0, Op0I, 1ull);
+  Value *Vec1Op0 =
+      Builder.CreateInsertElement(UndefValue::get(VecTy), Op1R, 0ull);
+  Value *Vec1Op1 = Builder.CreateInsertElement(Vec1Op0, Op1I, 1ull);
+  Value *Res = Builder.CreateIntrinsic(Intrinsic::complex_multiply, VecTy,
+                                       {Vec0Op1, Vec1Op1});
+
+  Inst->setOperand(1, Builder.CreateExtractElement(Res, 1ull));
+  cast<InsertValueInst>(Inst->getOperand(0))
+      ->setOperand(1, Builder.CreateExtractElement(Res, 0ull));
+  return true;
+}
+
 /// For the instruction sequence of store below, F and I values
 /// are bundled together as an i64 value before being stored into memory.
 /// Sometimes it is more efficient to generate separate stores for F and I,
@@ -7939,6 +7982,8 @@
     return optimizeExtractElementInst(cast<ExtractElementInst>(I));
   case Instruction::Br:
     return optimizeBranch(cast<BranchInst>(I), *TLI);
+  case Instruction::InsertValue:
+    return optimizeInsertValueInst(cast<InsertValueInst>(I));
   }
 
   return false;
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/complex-multiply.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/complex-multiply.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/complex-multiply.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -codegenprepare -S %s | FileCheck %s
+
+target triple = "arm64-apple-ios"
+
+%complex_t = type { float, float }
+
+define %complex_t @complex_multiply_float(%complex_t* %x, %complex_t* %y) local_unnamed_addr #0 {
+; CHECK-LABEL: @complex_multiply_float(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X_R_ADDR1:%.*]] = bitcast %complex_t* [[X:%.*]] to float*
+; CHECK-NEXT:    [[X_R:%.*]] = load float, float* [[X_R_ADDR1]], align 4
+; CHECK-NEXT:    [[X_I_ADDR:%.*]] = getelementptr inbounds [[COMPLEX_T:%.*]], %complex_t* [[X]], i64 0, i32 1
+; CHECK-NEXT:    [[X_I:%.*]] = load float, float* [[X_I_ADDR]], align 4
+; CHECK-NEXT:    [[Y_R_ADDR2:%.*]] = bitcast %complex_t* [[Y:%.*]] to float*
+; CHECK-NEXT:    [[Y_R:%.*]] = load float, float* [[Y_R_ADDR2]], align 4
+; CHECK-NEXT:    [[Y_I_ADDR:%.*]] = getelementptr inbounds [[COMPLEX_T]], %complex_t* [[Y]], i64 0, i32 1
+; CHECK-NEXT:    [[Y_I:%.*]] = load float, float* [[Y_I_ADDR]], align 4
+; CHECK-NEXT:    [[MUL_I:%.*]] = fmul fast float [[Y_R]], [[X_R]]
+; CHECK-NEXT:    [[MUL4_I:%.*]] = fmul fast float [[Y_I]], [[X_I]]
+; CHECK-NEXT:    [[MUL5_I:%.*]] = fmul fast float [[Y_I]], [[X_R]]
+; CHECK-NEXT:    [[MUL6_I:%.*]] = fmul fast float [[Y_R]], [[X_I]]
+; CHECK-NEXT:    [[SUB_I:%.*]] = fsub fast float [[MUL_I]], [[MUL4_I]]
+; CHECK-NEXT:    [[ADD_I:%.*]] = fadd fast float [[MUL5_I]], [[MUL6_I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> undef, float [[Y_R]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[Y_I]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> undef, float [[X_R]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[X_I]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.complex.multiply.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i64 0
+; CHECK-NEXT:    [[RES_R:%.*]] = insertvalue [[COMPLEX_T]] undef, float [[TMP6]], 0
+; CHECK-NEXT:    [[RES_I:%.*]] = insertvalue [[COMPLEX_T]] [[RES_R]], float [[TMP5]], 1
+; CHECK-NEXT:    ret [[COMPLEX_T]] [[RES_I]]
+;
+entry:
+  %x.r.addr = getelementptr inbounds %complex_t, %complex_t* %x, i64 0, i32 0
+  %x.r = load float, float* %x.r.addr, align 4
+  %x.i.addr = getelementptr inbounds %complex_t, %complex_t* %x, i64 0, i32 1
+  %x.i = load float, float* %x.i.addr, align 4
+  %y.r.addr = getelementptr inbounds %complex_t, %complex_t* %y, i64 0, i32 0
+  %y.r = load float, float* %y.r.addr, align 4
+  %y.i.addr = getelementptr inbounds %complex_t, %complex_t* %y, i64 0, i32 1
+  %y.i = load float, float* %y.i.addr, align 4
+  %mul.i = fmul fast float %y.r, %x.r
+  %mul4.i = fmul fast float %y.i, %x.i
+  %mul5.i = fmul fast float %y.i, %x.r
+  %mul6.i = fmul fast float %y.r, %x.i
+  %sub.i = fsub fast float %mul.i, %mul4.i
+  %add.i = fadd fast float %mul5.i, %mul6.i
+  %res.r = insertvalue %complex_t undef, float %sub.i, 0
+  %res.i = insertvalue %complex_t %res.r, float %add.i, 1
+  ret %complex_t %res.i
+}
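
Note: the pattern recognized above is the scalar expansion of a complex multiply,
real = a*c - b*d and imag = a*d + b*c. As a rough illustration, a C/C++ function
along the lines of the following (a hypothetical reduction; the `fast` flags seen
in the test would come from building with something like -O2 -ffast-math) lowers
to the fmul/fsub/fadd plus insertvalue chain that optimizeInsertValueInst matches:

  typedef struct { float r, i; } complex_t;

  complex_t complex_multiply_float(const complex_t *x, const complex_t *y) {
    complex_t res;
    res.r = x->r * y->r - x->i * y->i; /* real part: fmul, fmul, fsub */
    res.i = x->r * y->i + x->i * y->r; /* imaginary part: fmul, fmul, fadd */
    return res;
  }

As the CHECK lines show, the rewrite only redirects the two insertvalue operands
to the extracted intrinsic results; the now-dead scalar fmul/fsub/fadd chain is
left in place for later passes to clean up.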