Index: llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
+++ llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -264,6 +264,50 @@
     DupMCID = &TII->get(AArch64::DUPv2i32lane);
     MulMCID = &TII->get(AArch64::FMULv2f32);
     break;
+
+  // 1X64 instructions
+  case AArch64::FMLAv1i64_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::CPYi64);
+    MulMCID = &TII->get(AArch64::FMADDDrrr);
+    break;
+  case AArch64::FMLSv1i64_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::CPYi64);
+    MulMCID = &TII->get(AArch64::FMSUBDrrr);
+    break;
+  case AArch64::FMULXv1i64_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::CPYi64);
+    MulMCID = &TII->get(AArch64::FMULX64);
+    break;
+  case AArch64::FMULv1i64_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::CPYi64);
+    MulMCID = &TII->get(AArch64::FMULDrr);
+    break;
+
+  // 1X32 instructions
+  case AArch64::FMLAv1i32_indexed:
+    RC = &AArch64::FPR32RegClass;
+    DupMCID = &TII->get(AArch64::CPYi32);
+    MulMCID = &TII->get(AArch64::FMADDSrrr);
+    break;
+  case AArch64::FMLSv1i32_indexed:
+    RC = &AArch64::FPR32RegClass;
+    DupMCID = &TII->get(AArch64::CPYi32);
+    MulMCID = &TII->get(AArch64::FMSUBSrrr);
+    break;
+  case AArch64::FMULXv1i32_indexed:
+    RC = &AArch64::FPR32RegClass;
+    DupMCID = &TII->get(AArch64::CPYi32);
+    MulMCID = &TII->get(AArch64::FMULX32);
+    break;
+  case AArch64::FMULv1i32_indexed:
+    RC = &AArch64::FPR32RegClass;
+    DupMCID = &TII->get(AArch64::CPYi32);
+    MulMCID = &TII->get(AArch64::FMULSrr);
+    break;
   }
 
   if (!shouldReplaceInstruction(MI.getParent()->getParent(),
@@ -298,10 +342,26 @@
           .addReg(SrcReg2, Src2IsKill)
           .addImm(LaneNumber);
     }
-    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
-        .addReg(SrcReg0, Src0IsKill)
-        .addReg(SrcReg1, Src1IsKill)
-        .addReg(DupDest, Src2IsKill);
+    // Create the new non-indexed instruction.
+    switch (MI.getOpcode()) {
+    default:
+      // Most replacement instructions keep the accumulator as the first operand.
+      BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+          .addReg(SrcReg0, Src0IsKill)
+          .addReg(SrcReg1, Src1IsKill)
+          .addReg(DupDest, Src2IsKill);
+      break;
+    case AArch64::FMLAv1i32_indexed:
+    case AArch64::FMLSv1i32_indexed:
+    case AArch64::FMLAv1i64_indexed:
+    case AArch64::FMLSv1i64_indexed:
+      // The scalar FMADD/FMSUB instructions take the accumulator last.
+      BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+          .addReg(SrcReg1, Src1IsKill)
+          .addReg(DupDest, Src2IsKill)
+          .addReg(SrcReg0, Src0IsKill);
+      break;
+    }
   } else if (MI.getNumOperands() == 4) {
     unsigned LaneNumber = MI.getOperand(3).getImm();
     if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -8,6 +8,10 @@
 
 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
 
+declare float @llvm.aarch64.neon.fmulx.f32(float, float)
+
+declare double @llvm.aarch64.neon.fmulx.f64(double, double)
+
 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
 
 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
@@ -530,6 +534,19 @@
   ret <2 x double> %0
 }
 
+define double @test_vfmla_lane_f64(double %a, double %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfmla_lane_f64:
+; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmla_lane_f64:
+; EXYNOS: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; EXYNOS-NEXT: ret
+entry:
+  %extract = extractelement <2 x double> %v, i32 1
+  %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
+  ret double %0
+}
+
 define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
 ; CHECK-LABEL: test_vfmsq_lane_f64:
 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
@@ -593,7 +610,7 @@
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
 ; EXYNOS-LABEL: test_vfmss_lane_f32
-; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x float> %v, i32 1
@@ -618,7 +635,7 @@
 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
 ; EXYNOS-LABEL: test_vfmsd_laneq_f64
-; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 ; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x double> %v, i32 1
@@ -643,7 +660,7 @@
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
 ; EXYNOS-LABEL: test_vfmss_lane_f32_0
-; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; EXYNOS-NEXT: ret
 entry:
   %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
@@ -1543,7 +1560,7 @@
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
 ; EXYNOS-LABEL: test_vmul_laneq_f64:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 ; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
@@ -2058,6 +2075,19 @@
   ret <4 x float> %0
 }
 
+define float @test_vfmla_lane_f32(float %a, float %b, <2 x float> %v) {
+; CHECK-LABEL: test_vfmla_lane_f32:
+; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmla_lane_f32:
+; EXYNOS: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; EXYNOS-NEXT: ret
+entry:
+  %extract = extractelement <2 x float> %v, i32 1
+  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
+  ret float %0
+}
+
 define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfms_lane_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
@@ -2986,7 +3016,7 @@
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
 ; EXYNOS-LABEL: test_vmul_laneq_f64_0:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 ; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
@@ -3109,6 +3139,32 @@
   ret <2 x double> %vmulx2.i
 }
 
+define double @test_vmulx_lane_f64(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_vmulx_lane_f64:
+; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f64:
+; EXYNOS: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; EXYNOS-NEXT: ret
+entry:
+  %extract = extractelement <2 x double> %v, i32 1
+  %0 = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %extract)
+  ret double %0
+}
+
+define float @test_vmulx_lane_f32_1(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_vmulx_lane_f32_1:
+; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f32_1:
+; EXYNOS: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; EXYNOS-NEXT: ret
+entry:
+  %extract = extractelement <2 x float> %v, i32 1
+  %0 = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %extract)
+  ret float %0
+}
+
 define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
 ; CHECK-LABEL: optimize_dup:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
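
Note on the intended transformation: the new EXYNOS check lines encode the rewrite the pass performs for the scalar indexed forms. As a minimal sketch (register names are illustrative, not taken from actual codegen), the FMLAv1i64_indexed case turns

    fmla  d0, d1, v2.d[1]

into the equivalent two-instruction sequence

    mov   d3, v2.d[1]        // lane copy emitted through CPYi64; reuseDUP() can share it
    fmadd d0, d1, d3, d0     // FMADDDrrr: d0 = d1 * d3 + d0

The operand swap in the second BuildMI arm exists because FMADD/FMSUB take the accumulator (SrcReg0) as their last source operand, whereas the indexed FMLA/FMLS forms tie it to the destination.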