Index: llvm/lib/Target/AArch64/AArch64.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64.h
+++ llvm/lib/Target/AArch64/AArch64.h
@@ -36,6 +36,7 @@
 FunctionPass *createAArch64StorePairSuppressPass();
 FunctionPass *createAArch64ExpandPseudoPass();
 FunctionPass *createAArch64LoadStoreOptimizationPass();
+FunctionPass *createAArch64VectorByElementOptPass();
 ModulePass *createAArch64PromoteConstantPass();
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
@@ -57,6 +58,7 @@
 void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
 void initializeAArch64ExpandPseudoPass(PassRegistry&);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
+void initializeAArch64VectorByElementOptPass(PassRegistry&);
 void initializeAArch64PromoteConstantPass(PassRegistry&);
 void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
 void initializeAArch64StorePairSuppressPass(PassRegistry&);
Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -124,6 +124,11 @@
                            cl::desc("Enable the loop data prefetch pass"),
                            cl::init(true));
 
+static cl::opt<bool>
+    EnableVectorByElement("enable-vector-by-element-opt", cl::Hidden,
+                          cl::init(true),
+                          cl::desc("Enable vector by element optimization"));
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -142,6 +147,7 @@
   initializeAArch64DeadRegisterDefinitionsPass(*PR);
   initializeAArch64ExpandPseudoPass(*PR);
   initializeAArch64LoadStoreOptPass(*PR);
+  initializeAArch64VectorByElementOptPass(*PR);
   initializeAArch64PromoteConstantPass(*PR);
   initializeAArch64RedundantCopyEliminationPass(*PR);
   initializeAArch64StorePairSuppressPass(*PR);
@@ -447,6 +453,8 @@
     addPass(&EarlyIfConverterID);
   if (EnableStPairSuppress)
     addPass(createAArch64StorePairSuppressPass());
+  if (EnableVectorByElement)
+    addPass(createAArch64VectorByElementOptPass());
   return true;
 }
 
@@ -476,6 +484,7 @@
 void AArch64PassConfig::addPreSched2() {
   // Expand some pseudo instructions to allow proper scheduling.
   addPass(createAArch64ExpandPseudoPass());
+
   // Use load/store pair instructions when possible.
   if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt)
     addPass(createAArch64LoadStoreOptimizationPass());
Index: llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -0,0 +1,328 @@
+//=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs optimization for vector by element
+// SIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-vectorbyelement-opt"
+
+STATISTIC(NumModifiedInstr,
+          "Number of vector by element instructions modified");
+
+#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
+  "AArch64 vector by element instruction optimization pass"
+
+namespace {
+
+struct AArch64VectorByElementOpt : public MachineFunctionPass {
+  static char ID;
+  AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
+    initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
+  }
+
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+
+  /// Based only on latency of instructions, determine if it is cost efficient
+  /// to replace the instruction InstDesc by the two instructions InstDescRep1
+  /// and InstDescRep2.
+  /// Return true if replacement is recommended.
+  bool replaceInstruction(MachineFunction *MF,
+                          const MCInstrDesc *InstDesc,
+                          const MCInstrDesc *InstDescRep1,
+                          const MCInstrDesc *InstDescRep2,
+                          std::map<unsigned, bool> &VecInstElemTable) const;
+
+  /// Check whether an equivalent DUP instruction has already been
+  /// created or not.
+  /// Return true when the dup instruction already exists.
+  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
+                unsigned LaneNumber, unsigned *DestReg) const;
+
+  /// Certain SIMD instructions with a vector element operand are not
+  /// efficient. Rewrite them into SIMD instructions with vector operands.
+  /// This rewrite is driven by the latency of the instructions.
+  /// Return true if the SIMD instruction is modified.
+  bool optimizeVectElement(MachineInstr &MI,
+                           std::map<unsigned, bool> *VecInstElemTable) const;
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  const char *getPassName() const override {
+    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
+  }
+};
+char AArch64VectorByElementOpt::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
+                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
+
+/// Based only on latency of instructions, determine if it is cost efficient
+/// to replace the instruction InstDesc by the two instructions InstDescRep1
+/// and InstDescRep2. Note that it is assumed in this function that an
+/// instruction of type InstDesc is always replaced by the same two
+/// instructions as results are cached here.
+/// Return true if replacement is recommended.
+bool AArch64VectorByElementOpt::replaceInstruction(
+    MachineFunction *MF, const MCInstrDesc *InstDesc,
+    const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
+    std::map<unsigned, bool> &VecInstElemTable) const {
+  // Check if the replacement decision is already available in the cached
+  // table. If so, return it.
+  auto It = VecInstElemTable.find(InstDesc->getOpcode());
+  if (It != VecInstElemTable.end())
+    return It->second;
+
+  unsigned SCIdx = InstDesc->getSchedClass();
+  unsigned SCIdxRep1 = InstDescRep1->getSchedClass();
+  unsigned SCIdxRep2 = InstDescRep2->getSchedClass();
+
+  const TargetSubtargetInfo &ST = MF->getSubtarget();
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+  if (!TII)
+    return false;
+  TargetSchedModel SchedModel;
+  SchedModel.init(ST.getSchedModel(), &ST, TII);
+  if (!SchedModel.hasInstrSchedModel())
+    return false;
+
+  const MCSchedClassDesc *SCDesc =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
+  const MCSchedClassDesc *SCDescRep1 =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1);
+  const MCSchedClassDesc *SCDescRep2 =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2);
+
+  // If a subtarget does not define resources for any of the instructions
+  // of interest, then return false for no replacement.
+  if (!SCDesc->isValid() || SCDesc->isVariant() || !SCDescRep1->isValid() ||
+      SCDescRep1->isVariant() || !SCDescRep2->isValid() ||
+      SCDescRep2->isVariant()) {
+    VecInstElemTable[InstDesc->getOpcode()] = false;
+    return false;
+  }
+
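+  // Replacement is profitable only when the latency of the original
+  // instruction is strictly greater than the combined latency of the two
+  // replacement instructions.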
+  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) >
+      SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
+          SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) {
+    VecInstElemTable[InstDesc->getOpcode()] = true;
+    return true;
+  }
+  VecInstElemTable[InstDesc->getOpcode()] = false;
+  return false;
+}
+
+/// Check whether an equivalent DUP instruction has already been
+/// created or not.
+/// Return true when the dup instruction already exists.
+bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
+                                         unsigned SrcReg, unsigned LaneNumber,
+                                         unsigned *DestReg) const {
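+  // Walk backwards from MI towards the start of the basic block, looking for
+  // an earlier DUP with the same opcode, source register, and lane number.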
+  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
+       MII != MIE;) {
+    --MII;
+    MachineInstr *CurrentMI = &*MII;
+
+    if (CurrentMI->getOpcode() == DupOpcode &&
+        CurrentMI->getNumOperands() == 3 &&
+        CurrentMI->getOperand(1).getReg() == SrcReg &&
+        CurrentMI->getOperand(2).getImm() == LaneNumber) {
+      *DestReg = CurrentMI->getOperand(0).getReg();
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Certain SIMD instructions with a vector element operand are not
+/// efficient. Rewrite them into SIMD instructions with vector operands.
+/// This rewrite is driven by the latency of the instructions.
+/// The instructions of concern are, for the time being, fmla, fmls, fmul,
+/// and fmulx; hence, they are hardcoded.
+///
+/// Example:
+///    fmla v0.4s, v1.4s, v2.s[1]
+///    is rewritten into
+///    dup v3.4s, v2.s[1]           // dup not necessary if redundant
+///    fmla v0.4s, v1.4s, v3.4s
+/// Return true if the SIMD instruction is modified.
+bool AArch64VectorByElementOpt::optimizeVectElement(
+    MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
+  const MCInstrDesc *MulMCID, *DupMCID;
+  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
+
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+
+  // 4X32 instructions
+  case AArch64::FMLAv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMLAv4f32);
+    break;
+  case AArch64::FMLSv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMLSv4f32);
+    break;
+  case AArch64::FMULXv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMULXv4f32);
+    break;
+  case AArch64::FMULv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMULv4f32);
+    break;
+
+  // 2X64 instructions
+  case AArch64::FMLAv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMLAv2f64);
+    break;
+  case AArch64::FMLSv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMLSv2f64);
+    break;
+  case AArch64::FMULXv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMULXv2f64);
+    break;
+  case AArch64::FMULv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMULv2f64);
+    break;
+
+  // 2X32 instructions
+  case AArch64::FMLAv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMLAv2f32);
+    break;
+  case AArch64::FMLSv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMLSv2f32);
+    break;
+  case AArch64::FMULXv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMULXv2f32);
+    break;
+  case AArch64::FMULv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMULv2f32);
+    break;
+  }
+
+  if (!replaceInstruction(MI.getParent()->getParent(),
+                          &TII->get(MI.getOpcode()), DupMCID, MulMCID,
+                          *VecInstElemTable))
+    return false;
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // Get the operands of the current SIMD arithmetic instruction.
+  unsigned MulDest = MI.getOperand(0).getReg();
+  unsigned SrcReg0 = MI.getOperand(1).getReg();
+  bool Src0IsKill = MI.getOperand(1).isKill();
+  unsigned SrcReg1 = MI.getOperand(2).getReg();
+  bool Src1IsKill = MI.getOperand(2).isKill();
+  unsigned DupDest;
+
+  // Instructions of interest have either 4 (fmul, fmulx) or 5 (fmla, fmls)
+  // operands.
+  if (MI.getNumOperands() == 5) {
+    unsigned SrcReg2 = MI.getOperand(3).getReg();
+    bool Src2IsKill = MI.getOperand(3).isKill();
+    unsigned LaneNumber = MI.getOperand(4).getImm();
+
+    // Create the new DUP instruction, but only if an equivalent
+    // one has not already been created.
+    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
+      DupDest = MRI.createVirtualRegister(RC);
+      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+          .addReg(SrcReg2, getKillRegState(Src2IsKill))
+          .addImm(LaneNumber);
+    }
+    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+        .addReg(SrcReg0, getKillRegState(Src0IsKill))
+        .addReg(SrcReg1, getKillRegState(Src1IsKill))
+        .addReg(DupDest, getKillRegState(Src2IsKill));
+  } else if (MI.getNumOperands() == 4) {
+    unsigned LaneNumber = MI.getOperand(3).getImm();
+    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
+      DupDest = MRI.createVirtualRegister(RC);
+      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+          .addReg(SrcReg1, getKillRegState(Src1IsKill))
+          .addImm(LaneNumber);
+    }
+    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+        .addReg(SrcReg0, getKillRegState(Src0IsKill))
+        .addReg(DupDest, getKillRegState(Src1IsKill));
+  } else {
+    return false;
+  }
+
+  ++NumModifiedInstr;
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  bool Changed = false;
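+  // Cache of per-opcode replacement decisions, so the scheduling-model query
+  // in replaceInstruction() is made at most once per opcode.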
+  std::map<unsigned, bool> VecInstElemTable;
+
+  for (MachineBasicBlock &MBB : MF) {
+    SmallPtrSet<MachineInstr *, 16> LocalMIs;
+    for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+         MII != MIE;) {
+      MachineInstr &MI = *MII;
+      // We may be erasing MI below, so increment MII now.
+      ++MII;
+      LocalMIs.insert(&MI);
+
+      if (optimizeVectElement(MI, &VecInstElemTable)) {
+        // MI was erased from the block.
+        LocalMIs.erase(&MI);
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+/// createAArch64VectorByElementOptPass - returns an instance of the
+/// vector by element optimization pass.
+FunctionPass *llvm::createAArch64VectorByElementOptPass() {
+  return new AArch64VectorByElementOpt();
+}
Index: llvm/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AArch64/CMakeLists.txt
+++ llvm/lib/Target/AArch64/CMakeLists.txt
@@ -63,6 +63,7 @@
   AArch64TargetMachine.cpp
   AArch64TargetObjectFile.cpp
   AArch64TargetTransformInfo.cpp
+  AArch64VectorByElementOpt.cpp
   ${GLOBAL_ISEL_BUILD_FILES}
 )
 
Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s
 
 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
 
@@ -382,6 +383,9 @@
 ; CHECK-LABEL: test_vfma_lane_f32:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_lane_f32:
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -394,6 +398,9 @@
 ; CHECK-LABEL: test_vfmaq_lane_f32:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f32:
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -406,6 +413,9 @@
 ; CHECK-LABEL: test_vfma_laneq_f32:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_laneq_f32:
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -416,6 +426,9 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f32:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f32:
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -426,6 +439,9 @@
 ; CHECK-LABEL: test_vfms_lane_f32:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_lane_f32:
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -437,6 +453,9 @@
 ; CHECK-LABEL: test_vfmsq_lane_f32:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f32:
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -448,6 +467,9 @@
 ; CHECK-LABEL: test_vfms_laneq_f32:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_laneq_f32:
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
@@ -459,6 +481,9 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f32:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f32:
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -470,6 +495,9 @@
 ; CHECK-LABEL: test_vfmaq_lane_f64:
 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f64:
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -482,6 +510,9 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f64:
 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f64:
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -492,6 +523,9 @@
 ; CHECK-LABEL: test_vfmsq_lane_f64:
 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f64:
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <1 x double> <double -0.000000e+00>, %v
   %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
@@ -503,6 +537,9 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f64:
 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f64:
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -514,6 +551,9 @@
 ; CHECK-LABEL: test_vfmas_laneq_f32
 ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmas_laneq_f32
+; EXYNOS: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; EXYNOS-NEXT: ret
 entry:
   %extract = extractelement <4 x float> %v, i32 3
   %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
@@ -539,6 +579,9 @@
 ; CHECK-LABEL: test_vfmss_lane_f32
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmss_lane_f32
+; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x float> %v, i32 1
   %extract = fsub float -0.000000e+00, %extract.rhs
@@ -561,6 +604,9 @@
 ; CHECK-LABEL: test_vfmsd_laneq_f64
 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsd_laneq_f64
+; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x double> %v, i32 1
   %extract = fsub double -0.000000e+00, %extract.rhs
@@ -583,6 +629,9 @@
 ; CHECK-LABEL: test_vfmss_lane_f32_0
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmss_lane_f32_0
+; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS-NEXT: ret
 entry:
   %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %tmp1 = extractelement <2 x float> %tmp0, i32 1
@@ -1408,6 +1457,9 @@
 ; CHECK-LABEL: test_vmul_lane_f32:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f32:
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %mul = fmul <2 x float> %shuffle, %a
@@ -1418,6 +1470,9 @@
 ; CHECK-LABEL: test_vmul_lane_f64:
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f64:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -1431,6 +1486,9 @@
 ; CHECK-LABEL: test_vmulq_lane_f32:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f32:
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = fmul <4 x float> %shuffle, %a
@@ -1441,6 +1499,9 @@
 ; CHECK-LABEL: test_vmulq_lane_f64:
 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f64:
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x double> %shuffle, %a
@@ -1451,6 +1512,9 @@
 ; CHECK-LABEL: test_vmul_laneq_f32:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f32:
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %mul = fmul <2 x float> %shuffle, %a
@@ -1461,6 +1525,9 @@
 ; CHECK-LABEL: test_vmul_laneq_f64:
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f64:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -1474,6 +1541,9 @@
 ; CHECK-LABEL: test_vmulq_laneq_f32:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f32:
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = fmul <4 x float> %shuffle, %a
@@ -1484,6 +1554,9 @@
 ; CHECK-LABEL: test_vmulq_laneq_f64:
 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f64:
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %mul = fmul <2 x double> %shuffle, %a
@@ -1494,6 +1567,9 @@
 ; CHECK-LABEL: test_vmulx_lane_f32:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f32:
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1504,6 +1580,9 @@
 ; CHECK-LABEL: test_vmulxq_lane_f32:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f32:
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1514,6 +1593,9 @@
 ; CHECK-LABEL: test_vmulxq_lane_f64:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f64:
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1524,6 +1606,9 @@
 ; CHECK-LABEL: test_vmulx_laneq_f32:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_laneq_f32:
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1534,6 +1619,9 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f32:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f32:
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1544,6 +1632,9 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f64:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f64:
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1890,6 +1981,9 @@
 ; CHECK-LABEL: test_vfma_lane_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_lane_f32_0:
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1900,6 +1994,9 @@
 ; CHECK-LABEL: test_vfmaq_lane_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f32_0:
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1910,6 +2007,9 @@
 ; CHECK-LABEL: test_vfma_laneq_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_laneq_f32_0:
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1920,6 +2020,9 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f32_0:
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1930,6 +2033,9 @@
 ; CHECK-LABEL: test_vfms_lane_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_lane_f32_0:
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
@@ -1941,6 +2047,9 @@
 ; CHECK-LABEL: test_vfmsq_lane_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f32_0:
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
@@ -1952,6 +2061,9 @@
 ; CHECK-LABEL: test_vfms_laneq_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_laneq_f32_0:
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
@@ -1963,6 +2075,9 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f32_0:
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
@@ -1974,6 +2089,9 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f64_0:
 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f64_0:
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -1984,6 +2102,9 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f64_0:
 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f64_0:
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
@@ -2787,6 +2908,9 @@
 ; CHECK-LABEL: test_vmul_lane_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f32_0:
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x float> %shuffle, %a
@@ -2797,6 +2921,9 @@
 ; CHECK-LABEL: test_vmulq_lane_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f32_0:
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %mul = fmul <4 x float> %shuffle, %a
@@ -2807,6 +2934,9 @@
 ; CHECK-LABEL: test_vmul_laneq_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f32_0:
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x float> %shuffle, %a
@@ -2817,6 +2947,9 @@
 ; CHECK-LABEL: test_vmul_laneq_f64_0:
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f64_0:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -2830,6 +2963,9 @@
 ; CHECK-LABEL: test_vmulq_laneq_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f32_0:
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %mul = fmul <4 x float> %shuffle, %a
@@ -2840,6 +2976,9 @@
 ; CHECK-LABEL: test_vmulq_laneq_f64_0:
 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f64_0:
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x double> %shuffle, %a
@@ -2850,6 +2989,9 @@
 ; CHECK-LABEL: test_vmulx_lane_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f32_0:
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2860,6 +3002,9 @@
 ; CHECK-LABEL: test_vmulxq_lane_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f32_0:
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2870,6 +3015,9 @@
 ; CHECK-LABEL: test_vmulxq_lane_f64_0:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f64_0:
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -2880,6 +3028,9 @@
 ; CHECK-LABEL: test_vmulx_laneq_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_laneq_f32_0:
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2890,6 +3041,9 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f32_0:
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2900,6 +3054,9 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f64_0:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f64_0:
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)