Index: llvm/include/llvm/Target/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/Target/TargetInstrInfo.h
+++ llvm/include/llvm/Target/TargetInstrInfo.h
@@ -1181,6 +1181,16 @@
                                  const MachineRegisterInfo *MRI) const {
     return false;
   }
+
+  /// Certain SIMD instructions with a vector element operand are not
+  /// efficient. Rewrite them into SIMD instructions with vector operands.
+  /// Return true if the SIMD instruction is modified.
+  virtual bool optimizeVectElement(MachineInstr &MI,
+                                   std::map<unsigned, bool> *VecInstElemTable) const {
+    return false;
+  }
+
   virtual bool optimizeCondBranch(MachineInstr &MI) const { return false; }
 
   /// Try to remove the load by folding it to a register operand at the use.
Index: llvm/lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -157,6 +157,8 @@
     bool optimizeCoalescableCopy(MachineInstr *MI);
     bool optimizeUncoalescableCopy(MachineInstr *MI,
                                    SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+    bool optimizeVectElement(MachineInstr *MI,
+                             std::map<unsigned, bool> *VecInstElemTable);
     bool findNextSource(unsigned Reg, unsigned SubReg,
                         RewriteMapTy &RewriteMap);
     bool isMoveImmediate(MachineInstr *MI,
@@ -602,6 +604,13 @@
   return TII->optimizeCondBranch(*MI);
 }
 
+/// \brief Check if more efficient instructions can be generated.
+bool PeepholeOptimizer::optimizeVectElement(MachineInstr *MI,
+                                            std::map<unsigned, bool>
+                                                *VecInstElemTable) {
+  return TII->optimizeVectElement(*MI, VecInstElemTable);
+}
+
 /// \brief Try to find the next source that share the same register file
 /// for the value defined by \p Reg and \p SubReg.
 /// When true is returned, the \p RewriteMap can be used by the client to
@@ -1486,6 +1495,7 @@
   DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
 
   bool Changed = false;
+  std::map<unsigned, bool> VecInstElemTable;
 
   for (MachineBasicBlock &MBB : MF) {
     bool SeenMoveImm = false;
@@ -1595,6 +1605,14 @@
         continue;
       }
 
+      if (optimizeVectElement(MI, &VecInstElemTable)) {
+        // MI is deleted.
+        LocalMIs.erase(MI);
+        Changed = true;
+        continue;
+      }
+
+
       if (MI->isCopy() &&
           (foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs) ||
            foldRedundantNAPhysCopy(MI, NAPhysToVirtMIs))) {
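Note: targets that do not override the new TargetInstrInfo hook keep the default
above, which returns false so the peephole loop simply falls through. A minimal
sketch of the caching contract an override is expected to follow (the target and
its rewrite are hypothetical; only the std::map usage comes from this patch):

    // Sketch only: the table caches one decision per opcode, so a target
    // must always propose the same replacement for a given opcode.
    bool MyTargetInstrInfo::optimizeVectElement(
        MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
      std::map<unsigned, bool>::const_iterator It =
          VecInstElemTable->find(MI.getOpcode());
      if (It != VecInstElemTable->end() && !It->second)
        return false; // Cached: replacement known to be unprofitable.
      // ... otherwise build the replacement instructions, erase MI,
      // and return true ...
      return false;
    }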
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -197,6 +197,28 @@
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
+  /// Based only on the latency of instructions, determine if it is cost
+  /// efficient to replace the instruction InstDesc by the two instructions
+  /// InstDescRep1 and InstDescRep2.
+  /// Return true if replacement is recommended.
+  bool replaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
+                          const MCInstrDesc *InstDescRep1,
+                          const MCInstrDesc *InstDescRep2,
+                          std::map<unsigned, bool> &VecInstElemTable) const;
+
+  /// Check whether an equivalent DUP instruction has already been
+  /// created or not.
+  /// Return true when the DUP instruction already exists.
+  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
+                unsigned LaneNumber, unsigned *DestReg) const;
+
+  /// Certain SIMD instructions with a vector element operand are not
+  /// efficient. Rewrite them into SIMD instructions with vector operands.
+  /// This rewrite is driven by the latency of the instructions.
+  /// Return true if the SIMD instruction is modified.
+  bool optimizeVectElement(MachineInstr &MI,
+                           std::map<unsigned, bool> *VecInstElemTable) const override;
+
   std::pair<unsigned, unsigned>
   decomposeMachineOperandsTargetFlags(unsigned TF) const override;
 
   ArrayRef<std::pair<unsigned, const char *>>
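Note: the profitability decision below reduces to a single comparison against the
subtarget's scheduling model: replace when latency(indexed op) exceeds
latency(dup) + latency(vector op). With hypothetical latencies (illustrative
numbers, not taken from any real scheduling model):

    // latency(FMLAv4i32_indexed) = 7
    // latency(DUPv4i32lane)      = 1
    // latency(FMLAv4f32)         = 5
    // 7 > 1 + 5 holds, so `true` is cached for FMLAv4i32_indexed and the
    // indexed fmla is rewritten as dup + fmla on this subtarget.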
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -751,6 +752,219 @@
   return false;
 }
 
+/// Based only on the latency of instructions, determine if it is cost
+/// efficient to replace the instruction InstDesc by the two instructions
+/// InstDescRep1 and InstDescRep2. Note that this function assumes that an
+/// instruction of type InstDesc is always replaced by the same two
+/// instructions, as the results are cached here.
+/// Return true if replacement is recommended.
+bool AArch64InstrInfo::replaceInstruction(
+    MachineFunction *MF, const MCInstrDesc *InstDesc,
+    const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
+    std::map<unsigned, bool> &VecInstElemTable) const {
+  // Check if the replacement decision is already available in the cached
+  // table; if so, return it.
+  std::map<unsigned, bool>::iterator It =
+      VecInstElemTable.find(InstDesc->getOpcode());
+  if (It != VecInstElemTable.end())
+    return It->second;
+
+  unsigned SCIdx = InstDesc->getSchedClass();
+  unsigned SCIdxRep1 = InstDescRep1->getSchedClass();
+  unsigned SCIdxRep2 = InstDescRep2->getSchedClass();
+
+  const TargetSubtargetInfo &ST = MF->getSubtarget();
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+  if (!TII)
+    return false;
+
+  TargetSchedModel SchedModel;
+  SchedModel.init(ST.getSchedModel(), &ST, TII);
+  if (!SchedModel.hasInstrSchedModel())
+    return false;
+
+  const MCSchedClassDesc *SCDesc =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
+  const MCSchedClassDesc *SCDescRep1 =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1);
+  const MCSchedClassDesc *SCDescRep2 =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2);
+
+  // If the subtarget does not define resources for any of the instructions
+  // of interest, do not recommend a replacement.
+  if (!SCDesc->isValid() || SCDesc->isVariant() ||
+      !SCDescRep1->isValid() || SCDescRep1->isVariant() ||
+      !SCDescRep2->isValid() || SCDescRep2->isVariant()) {
+    VecInstElemTable[InstDesc->getOpcode()] = false;
+    return false;
+  }
+
+  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) >
+      SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
+          SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) {
+    VecInstElemTable[InstDesc->getOpcode()] = true;
+    return true;
+  }
+
+  VecInstElemTable[InstDesc->getOpcode()] = false;
+  return false;
+}
+
+/// Check whether an equivalent DUP instruction has already been
+/// created or not.
+/// Return true when the DUP instruction already exists.
+bool AArch64InstrInfo::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
+                                unsigned SrcReg, unsigned LaneNumber,
+                                unsigned *DestReg) const {
+  // Walk backwards within the basic block looking for a DUP with the same
+  // opcode, source register, and lane number.
+  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
+       MII != MIE;) {
+    MII--;
+    MachineInstr *CurrentMI = &*MII;
+
+    if (CurrentMI->getOpcode() == DupOpcode &&
+        CurrentMI->getNumOperands() == 3 &&
+        CurrentMI->getOperand(1).getReg() == SrcReg &&
+        CurrentMI->getOperand(2).getImm() == LaneNumber) {
+      *DestReg = CurrentMI->getOperand(0).getReg();
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Certain SIMD instructions with a vector element operand are not
+/// efficient. Rewrite them into SIMD instructions with vector operands.
+/// This rewrite is driven by the latency of the instructions.
+/// The instructions of concern are, for the time being, fmla, fmls, fmul,
+/// and fmulx; hence they are hardcoded.
+///
+/// Example:
+///   fmla v0.4s, v1.4s, v2.s[1]
+/// is rewritten into
+///   dup  v3.4s, v2.s[1]  // dup not necessary if redundant
+///   fmla v0.4s, v1.4s, v3.4s
+/// Return true if the SIMD instruction is modified.
+bool AArch64InstrInfo::optimizeVectElement(
+    MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
+  const MCInstrDesc *MulMCID, *DupMCID;
+  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
+
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+
+  // 4X32 instructions
+  case AArch64::FMLAv4i32_indexed:
+    DupMCID = &get(AArch64::DUPv4i32lane);
+    MulMCID = &get(AArch64::FMLAv4f32);
+    break;
+  case AArch64::FMLSv4i32_indexed:
+    DupMCID = &get(AArch64::DUPv4i32lane);
+    MulMCID = &get(AArch64::FMLSv4f32);
+    break;
+  case AArch64::FMULXv4i32_indexed:
+    DupMCID = &get(AArch64::DUPv4i32lane);
+    MulMCID = &get(AArch64::FMULXv4f32);
+    break;
+  case AArch64::FMULv4i32_indexed:
+    DupMCID = &get(AArch64::DUPv4i32lane);
+    MulMCID = &get(AArch64::FMULv4f32);
+    break;
+
+  // 2X64 instructions
+  case AArch64::FMLAv2i64_indexed:
+    DupMCID = &get(AArch64::DUPv2i64lane);
+    MulMCID = &get(AArch64::FMLAv2f64);
+    break;
+  case AArch64::FMLSv2i64_indexed:
+    DupMCID = &get(AArch64::DUPv2i64lane);
+    MulMCID = &get(AArch64::FMLSv2f64);
+    break;
+  case AArch64::FMULXv2i64_indexed:
+    DupMCID = &get(AArch64::DUPv2i64lane);
+    MulMCID = &get(AArch64::FMULXv2f64);
+    break;
+  case AArch64::FMULv2i64_indexed:
+    DupMCID = &get(AArch64::DUPv2i64lane);
+    MulMCID = &get(AArch64::FMULv2f64);
+    break;
+
+  // 2X32 instructions
+  case AArch64::FMLAv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &get(AArch64::DUPv2i32lane);
+    MulMCID = &get(AArch64::FMLAv2f32);
+    break;
+  case AArch64::FMLSv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &get(AArch64::DUPv2i32lane);
+    MulMCID = &get(AArch64::FMLSv2f32);
+    break;
+  case AArch64::FMULXv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &get(AArch64::DUPv2i32lane);
+    MulMCID = &get(AArch64::FMULXv2f32);
+    break;
+  case AArch64::FMULv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &get(AArch64::DUPv2i32lane);
+    MulMCID = &get(AArch64::FMULv2f32);
+    break;
+  }
+
+  if (!replaceInstruction(MI.getParent()->getParent(), &get(MI.getOpcode()),
+                          DupMCID, MulMCID, *VecInstElemTable))
+    return false;
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // Get the operands of the current SIMD arithmetic instruction.
+  unsigned MulDest = MI.getOperand(0).getReg();
+  unsigned SrcReg0 = MI.getOperand(1).getReg();
+  bool Src0IsKill = MI.getOperand(1).isKill();
+  unsigned SrcReg1 = MI.getOperand(2).getReg();
+  bool Src1IsKill = MI.getOperand(2).isKill();
+  unsigned DupDest;
+
+  // Instructions of interest have either 4 or 5 operands.
+  if (MI.getNumOperands() == 5) {
+    unsigned SrcReg2 = MI.getOperand(3).getReg();
+    bool Src2IsKill = MI.getOperand(3).isKill();
+    unsigned LaneNumber = MI.getOperand(4).getImm();
+
+    // Create the new DUP instruction, but only if an equivalent one has
+    // not already been created.
+    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
+      DupDest = MRI.createVirtualRegister(RC);
+      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+          .addReg(SrcReg2, getKillRegState(Src2IsKill))
+          .addImm(LaneNumber);
+    }
+    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+        .addReg(SrcReg0, getKillRegState(Src0IsKill))
+        .addReg(SrcReg1, getKillRegState(Src1IsKill))
+        .addReg(DupDest, getKillRegState(Src2IsKill));
+  } else if (MI.getNumOperands() == 4) {
+    unsigned LaneNumber = MI.getOperand(3).getImm();
+    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
+      DupDest = MRI.createVirtualRegister(RC);
+      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+          .addReg(SrcReg1, getKillRegState(Src1IsKill))
+          .addImm(LaneNumber);
+    }
+    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+        .addReg(SrcReg0, getKillRegState(Src0IsKill))
+        .addReg(DupDest, getKillRegState(Src1IsKill));
+  } else {
+    return false;
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
 static bool UpdateOperandRegClass(MachineInstr &Instr) {
   MachineBasicBlock *MBB = Instr.getParent();
   assert(MBB && "Can't get MachineBasicBlock here");
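Note: taken together, the rewrite turns each indexed multiply into a dup plus a
plain vector multiply, and reuseDUP lets several rewritten instructions in the
same basic block share one dup. An illustrative before/after in AArch64
assembly (register numbers are made up; only the single-fmla case appears in
the code comments above):

    // Before:
    //   fmla v0.4s, v1.4s, v2.s[1]
    //   fmul v3.4s, v4.4s, v2.s[1]
    // After (one dup feeds both rewritten instructions):
    //   dup  v5.4s, v2.s[1]
    //   fmla v0.4s, v1.4s, v5.4s
    //   fmul v3.4s, v4.4s, v5.4s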
Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s
 
 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
 
@@ -382,6 +383,9 @@
 ; CHECK-LABEL: test_vfma_lane_f32:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_lane_f32:
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -394,6 +398,9 @@
 ; CHECK-LABEL: test_vfmaq_lane_f32:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f32:
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -406,6 +413,9 @@
 ; CHECK-LABEL: test_vfma_laneq_f32:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_laneq_f32:
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -416,6 +426,9 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f32:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f32:
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -426,6 +439,9 @@
 ; CHECK-LABEL: test_vfms_lane_f32:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_lane_f32:
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -437,6 +453,9 @@
 ; CHECK-LABEL: test_vfmsq_lane_f32:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f32:
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -448,6 +467,9 @@
 ; CHECK-LABEL: test_vfms_laneq_f32:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_laneq_f32:
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
@@ -459,6 +481,9 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f32:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f32:
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -470,6 +495,9 @@
 ; CHECK-LABEL: test_vfmaq_lane_f64:
 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f64:
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -482,6 +510,9 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f64:
 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f64:
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -492,6 +523,9 @@
 ; CHECK-LABEL: test_vfmsq_lane_f64:
 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f64:
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <1 x double> <double -0.000000e+00>, %v
   %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
@@ -503,6 +537,9 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f64:
 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f64:
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -514,6 +551,9 @@
 ; CHECK-LABEL: test_vfmas_laneq_f32
 ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmas_laneq_f32
+; EXYNOS: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; EXYNOS-NEXT: ret
 entry:
   %extract = extractelement <4 x float> %v, i32 3
   %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
@@ -539,6 +579,9 @@
 ; CHECK-LABEL: test_vfmss_lane_f32
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmss_lane_f32
+; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x float> %v, i32 1
   %extract = fsub float -0.000000e+00, %extract.rhs
@@ -561,6 +604,9 @@
 ; CHECK-LABEL: test_vfmsd_laneq_f64
 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsd_laneq_f64
+; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x double> %v, i32 1
   %extract = fsub double -0.000000e+00, %extract.rhs
@@ -583,6 +629,9 @@
 ; CHECK-LABEL: test_vfmss_lane_f32_0
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmss_lane_f32_0
+; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS-NEXT: ret
 entry:
   %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %tmp1 = extractelement <2 x float> %tmp0, i32 1
@@ -1408,6 +1457,9 @@
 ; CHECK-LABEL: test_vmul_lane_f32:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f32:
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %mul = fmul <2 x float> %shuffle, %a
@@ -1418,6 +1470,9 @@
 ; CHECK-LABEL: test_vmul_lane_f64:
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f64:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -1431,6 +1486,9 @@
 ; CHECK-LABEL: test_vmulq_lane_f32:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f32:
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = fmul <4 x float> %shuffle, %a
@@ -1441,6 +1499,9 @@
 ; CHECK-LABEL: test_vmulq_lane_f64:
 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f64:
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x double> %shuffle, %a
@@ -1451,6 +1512,9 @@
 ; CHECK-LABEL: test_vmul_laneq_f32:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f32:
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %mul = fmul <2 x float> %shuffle, %a
@@ -1461,6 +1525,9 @@
 ; CHECK-LABEL: test_vmul_laneq_f64:
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f64:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -1474,6 +1541,9 @@
 ; CHECK-LABEL: test_vmulq_laneq_f32:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f32:
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = fmul <4 x float> %shuffle, %a
@@ -1484,6 +1554,9 @@
 ; CHECK-LABEL: test_vmulq_laneq_f64:
 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f64:
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %mul = fmul <2 x double> %shuffle, %a
@@ -1494,6 +1567,9 @@
 ; CHECK-LABEL: test_vmulx_lane_f32:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f32:
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1504,6 +1580,9 @@
 ; CHECK-LABEL: test_vmulxq_lane_f32:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f32:
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1514,6 +1593,9 @@
 ; CHECK-LABEL: test_vmulxq_lane_f64:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f64:
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1524,6 +1606,9 @@
 ; CHECK-LABEL: test_vmulx_laneq_f32:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_laneq_f32:
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1534,6 +1619,9 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f32:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f32:
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1544,6 +1632,9 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f64:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f64:
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1890,6 +1981,9 @@
 ; CHECK-LABEL: test_vfma_lane_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_lane_f32_0:
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1900,6 +1994,9 @@
 ; CHECK-LABEL: test_vfmaq_lane_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f32_0:
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1910,6 +2007,9 @@
 ; CHECK-LABEL: test_vfma_laneq_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_laneq_f32_0:
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1920,6 +2020,9 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f32_0:
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1930,6 +2033,9 @@
 ; CHECK-LABEL: test_vfms_lane_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_lane_f32_0:
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
@@ -1941,6 +2047,9 @@
 ; CHECK-LABEL: test_vfmsq_lane_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f32_0:
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
@@ -1952,6 +2061,9 @@
 ; CHECK-LABEL: test_vfms_laneq_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_laneq_f32_0:
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
@@ -1963,6 +2075,9 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f32_0:
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
@@ -1974,6 +2089,9 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f64_0:
 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f64_0:
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -1984,6 +2102,9 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f64_0:
 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f64_0:
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
@@ -2787,6 +2908,9 @@
 ; CHECK-LABEL: test_vmul_lane_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f32_0:
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x float> %shuffle, %a
@@ -2797,6 +2921,9 @@
 ; CHECK-LABEL: test_vmulq_lane_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f32_0:
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %mul = fmul <4 x float> %shuffle, %a
@@ -2807,6 +2934,9 @@
 ; CHECK-LABEL: test_vmul_laneq_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f32_0:
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x float> %shuffle, %a
@@ -2817,6 +2947,9 @@
 ; CHECK-LABEL: test_vmul_laneq_f64_0:
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f64_0:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -2830,6 +2963,9 @@
 ; CHECK-LABEL: test_vmulq_laneq_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f32_0:
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %mul = fmul <4 x float> %shuffle, %a
@@ -2840,6 +2976,9 @@
 ; CHECK-LABEL: test_vmulq_laneq_f64_0:
 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f64_0:
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x double> %shuffle, %a
@@ -2850,6 +2989,9 @@
 ; CHECK-LABEL: test_vmulx_lane_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f32_0:
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2860,6 +3002,9 @@
 ; CHECK-LABEL: test_vmulxq_lane_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f32_0:
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2870,6 +3015,9 @@
 ; CHECK-LABEL: test_vmulxq_lane_f64_0:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f64_0:
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -2880,6 +3028,9 @@
 ; CHECK-LABEL: test_vmulx_laneq_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_laneq_f32_0:
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2890,6 +3041,9 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f32_0:
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2900,6 +3054,9 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f64_0:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f64_0:
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
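Note: the new EXYNOS check lines can be exercised directly with the second RUN
line above (the command below just spells it out against the test file itself):

    llc < llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll \
        -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon \
        -fp-contract=fast -mcpu=exynos-m1 | \
        FileCheck --check-prefix=EXYNOS llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll

The scalar tests (test_vfmas_laneq_f32, test_vfmss_lane_f32,
test_vfmsd_laneq_f64, test_vmul_lane_f64, test_vmul_laneq_f64, and their _0
variants) still expect the element form under -mcpu=exynos-m1, since the
rewrite only covers the vector indexed opcodes listed in the switch in
AArch64InstrInfo::optimizeVectElement.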