Index: llvm/trunk/lib/Target/AArch64/AArch64.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64.h
+++ llvm/trunk/lib/Target/AArch64/AArch64.h
@@ -35,6 +35,7 @@
 FunctionPass *createAArch64StorePairSuppressPass();
 FunctionPass *createAArch64ExpandPseudoPass();
 FunctionPass *createAArch64LoadStoreOptimizationPass();
+FunctionPass *createAArch64VectorByElementOptPass();
 ModulePass *createAArch64PromoteConstantPass();
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
@@ -55,6 +56,7 @@
 void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
 void initializeAArch64ExpandPseudoPass(PassRegistry&);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
+void initializeAArch64VectorByElementOptPass(PassRegistry&);
 void initializeAArch64PromoteConstantPass(PassRegistry&);
 void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
 void initializeAArch64StorePairSuppressPass(PassRegistry&);
Index: llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -141,6 +141,7 @@
   initializeAArch64DeadRegisterDefinitionsPass(*PR);
   initializeAArch64ExpandPseudoPass(*PR);
   initializeAArch64LoadStoreOptPass(*PR);
+  initializeAArch64VectorByElementOptPass(*PR);
   initializeAArch64PromoteConstantPass(*PR);
   initializeAArch64RedundantCopyEliminationPass(*PR);
   initializeAArch64StorePairSuppressPass(*PR);
@@ -422,6 +423,7 @@
     addPass(&EarlyIfConverterID);
   if (EnableStPairSuppress)
     addPass(createAArch64StorePairSuppressPass());
+  addPass(createAArch64VectorByElementOptPass());
   return true;
 }
Index: llvm/trunk/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -0,0 +1,371 @@
+//=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs optimization for vector by element
+// SIMD instructions.
+//
+// Certain SIMD instructions with a vector element operand are not efficient.
+// Rewrite them into SIMD instructions with vector operands. This rewrite
+// is driven by the latency of the instructions.
+//
+// Example:
+//   fmla v0.4s, v1.4s, v2.s[1]
+// is rewritten into
+//   dup v3.4s, v2.s[1]
+//   fmla v0.4s, v1.4s, v3.4s
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-vectorbyelement-opt"
+
+STATISTIC(NumModifiedInstr,
+          "Number of vector by element instructions modified");
+
+#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
+  "AArch64 vector by element instruction optimization pass"
+
+namespace {
+
+struct AArch64VectorByElementOpt : public MachineFunctionPass {
+  static char ID;
+  AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
+    initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
+  }
+
+  const TargetInstrInfo *TII;
+  MachineRegisterInfo *MRI;
+  TargetSchedModel SchedModel;
+
+  /// Based only on the latency of instructions, determine if it is
+  /// cost-efficient to replace the instruction InstDesc by the two
+  /// instructions InstDescRep1 and InstDescRep2.
+  /// Return true if replacement is recommended.
+  bool
+  shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
+                           const MCInstrDesc *InstDescRep1,
+                           const MCInstrDesc *InstDescRep2,
+                           std::map<unsigned, bool> &VecInstElemTable) const;
+
+  /// Determine if we need to exit the vector by element instruction
+  /// optimization pass early. This makes sure that targets with no need
+  /// for this optimization do not spend any compile time on this pass.
+  /// This check is done by comparing the latency of an indexed FMLA
+  /// instruction to the latency of the DUP + the latency of a vector
+  /// FMLA instruction. We do not check on other related instructions such
+  /// as FMLS as we assume that if the situation shows up for one
+  /// instruction, then it is likely to show up for the related ones.
+  /// Return true if early exit of the pass is recommended.
+  bool earlyExitVectElement(MachineFunction *MF);
+
+  /// Check whether an equivalent DUP instruction has already been
+  /// created or not.
+  /// Return true when the DUP instruction already exists. In this case,
+  /// DestReg will point to the destination of the already created DUP.
+  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
+                unsigned LaneNumber, unsigned *DestReg) const;
+
+  /// Certain SIMD instructions with a vector element operand are not
+  /// efficient. Rewrite them into SIMD instructions with vector operands.
+  /// This rewrite is driven by the latency of the instructions.
+  /// Return true if the SIMD instruction is modified.
+  bool optimizeVectElement(MachineInstr &MI,
+                           std::map<unsigned, bool> *VecInstElemTable) const;
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  StringRef getPassName() const override {
+    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
+  }
+};
+char AArch64VectorByElementOpt::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
+                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
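+
+// A worked instance of the latency rule used by shouldReplaceInstruction
+// below, with purely illustrative numbers (not taken from any real scheduling
+// model): if FMLAv4i32_indexed has latency 6 while DUPv4i32lane and FMLAv4f32
+// have latencies 2 and 3, then 6 > 2 + 3 and the rewrite is recommended.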
+
+/// Based only on the latency of instructions, determine if it is
+/// cost-efficient to replace the instruction InstDesc by the two instructions
+/// InstDescRep1 and InstDescRep2. Note that it is assumed in this function
+/// that an instruction of type InstDesc is always replaced by the same two
+/// instructions, as results are cached here.
+/// Return true if replacement is recommended.
+bool AArch64VectorByElementOpt::shouldReplaceInstruction(
+    MachineFunction *MF, const MCInstrDesc *InstDesc,
+    const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
+    std::map<unsigned, bool> &VecInstElemTable) const {
+  // Check if the replacement decision is already available in the cached
+  // table. If so, return it.
+  if (!VecInstElemTable.empty() &&
+      VecInstElemTable.find(InstDesc->getOpcode()) != VecInstElemTable.end())
+    return VecInstElemTable[InstDesc->getOpcode()];
+
+  unsigned SCIdx = InstDesc->getSchedClass();
+  unsigned SCIdxRep1 = InstDescRep1->getSchedClass();
+  unsigned SCIdxRep2 = InstDescRep2->getSchedClass();
+  const MCSchedClassDesc *SCDesc =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
+  const MCSchedClassDesc *SCDescRep1 =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1);
+  const MCSchedClassDesc *SCDescRep2 =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2);
+
+  // If a subtarget does not define resources for any of the instructions
+  // of interest, then return false for no replacement.
+  if (!SCDesc->isValid() || SCDesc->isVariant() || !SCDescRep1->isValid() ||
+      SCDescRep1->isVariant() || !SCDescRep2->isValid() ||
+      SCDescRep2->isVariant()) {
+    VecInstElemTable[InstDesc->getOpcode()] = false;
+    return false;
+  }
+
+  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) >
+      SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
+          SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) {
+    VecInstElemTable[InstDesc->getOpcode()] = true;
+    return true;
+  }
+  VecInstElemTable[InstDesc->getOpcode()] = false;
+  return false;
+}
+
+/// Determine if we need to exit the vector by element instruction
+/// optimization pass early. This makes sure that targets with no need
+/// for this optimization do not spend any compile time on this pass.
+/// This check is done by comparing the latency of an indexed FMLA
+/// instruction to the latency of the DUP + the latency of a vector
+/// FMLA instruction. We do not check on other related instructions such
+/// as FMLS as we assume that if the situation shows up for one
+/// instruction, then it is likely to show up for the related ones.
+/// Return true if early exit of the pass is recommended.
+bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) {
+  std::map<unsigned, bool> VecInstElemTable;
+  const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed);
+  const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane);
+  const MCInstrDesc *MulMCID = &TII->get(AArch64::FMULv4f32);
+
+  if (!shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID,
+                                VecInstElemTable))
+    return true;
+  return false;
+}
+
+/// Check whether an equivalent DUP instruction has already been
+/// created or not.
+/// Return true when the DUP instruction already exists. In this case,
+/// DestReg will point to the destination of the already created DUP.
+bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
+                                         unsigned SrcReg, unsigned LaneNumber,
+                                         unsigned *DestReg) const {
+  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
+       MII != MIE;) {
+    MII--;
+    MachineInstr *CurrentMI = &*MII;
+
+    if (CurrentMI->getOpcode() == DupOpcode &&
+        CurrentMI->getNumOperands() == 3 &&
+        CurrentMI->getOperand(1).getReg() == SrcReg &&
+        CurrentMI->getOperand(2).getImm() == LaneNumber) {
+      *DestReg = CurrentMI->getOperand(0).getReg();
+      return true;
+    }
+  }
+
+  return false;
+}
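+
+// As an illustration of reuseDUP: in a block that has been rewritten into
+//   dup  v3.4s, v2.s[3]
+//   fmla v0.4s, v1.4s, v3.4s
+//   fmls v4.4s, v5.4s, v3.4s
+// the second by-element instruction reuses the DUP created for the first,
+// since both read lane 3 of the same source register (see the optimize_dup
+// test in arm64-neon-2velem.ll below).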
+
+/// Certain SIMD instructions with a vector element operand are not efficient.
+/// Rewrite them into SIMD instructions with vector operands. This rewrite
+/// is driven by the latency of the instructions.
+/// The instructions of concern are, for the time being, fmla, fmls, fmul,
+/// and fmulx; hence they are hardcoded.
+///
+/// Example:
+///   fmla v0.4s, v1.4s, v2.s[1]
+/// is rewritten into
+///   dup v3.4s, v2.s[1]      // dup not necessary if redundant
+///   fmla v0.4s, v1.4s, v3.4s
+/// Return true if the SIMD instruction is modified.
+bool AArch64VectorByElementOpt::optimizeVectElement(
+    MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
+  const MCInstrDesc *MulMCID, *DupMCID;
+  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
+
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+
+  // 4X32 instructions
+  case AArch64::FMLAv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMLAv4f32);
+    break;
+  case AArch64::FMLSv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMLSv4f32);
+    break;
+  case AArch64::FMULXv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMULXv4f32);
+    break;
+  case AArch64::FMULv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMULv4f32);
+    break;
+
+  // 2X64 instructions
+  case AArch64::FMLAv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMLAv2f64);
+    break;
+  case AArch64::FMLSv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMLSv2f64);
+    break;
+  case AArch64::FMULXv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMULXv2f64);
+    break;
+  case AArch64::FMULv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMULv2f64);
+    break;
+
+  // 2X32 instructions
+  case AArch64::FMLAv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMLAv2f32);
+    break;
+  case AArch64::FMLSv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMLSv2f32);
+    break;
+  case AArch64::FMULXv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMULXv2f32);
+    break;
+  case AArch64::FMULv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMULv2f32);
+    break;
+  }
+
+  if (!shouldReplaceInstruction(MI.getParent()->getParent(),
+                                &TII->get(MI.getOpcode()), DupMCID, MulMCID,
+                                *VecInstElemTable))
+    return false;
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // Get the operands of the current SIMD arithmetic instruction.
+  unsigned MulDest = MI.getOperand(0).getReg();
+  unsigned SrcReg0 = MI.getOperand(1).getReg();
+  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
+  unsigned SrcReg1 = MI.getOperand(2).getReg();
+  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
+  unsigned DupDest;
+
+  // Instructions of interest have either 4 or 5 operands.
+  if (MI.getNumOperands() == 5) {
+    unsigned SrcReg2 = MI.getOperand(3).getReg();
+    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
+    unsigned LaneNumber = MI.getOperand(4).getImm();
+
+    // Create a new DUP instruction. Note that if an equivalent DUP
+    // instruction has already been created before, then use that one
+    // instead of creating a new one.
+    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
+      DupDest = MRI.createVirtualRegister(RC);
+      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+          .addReg(SrcReg2, Src2IsKill)
+          .addImm(LaneNumber);
+    }
+    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+        .addReg(SrcReg0, Src0IsKill)
+        .addReg(SrcReg1, Src1IsKill)
+        .addReg(DupDest, Src2IsKill);
+  } else if (MI.getNumOperands() == 4) {
+    unsigned LaneNumber = MI.getOperand(3).getImm();
+    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
+      DupDest = MRI.createVirtualRegister(RC);
+      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+          .addReg(SrcReg1, Src1IsKill)
+          .addImm(LaneNumber);
+    }
+    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+        .addReg(SrcReg0, Src0IsKill)
+        .addReg(DupDest, Src1IsKill);
+  } else {
+    return false;
+  }
+
+  ++NumModifiedInstr;
+  return true;
+}
+
+bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  TII = MF.getSubtarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
+  const AArch64InstrInfo *AAII =
+      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+  if (!AAII)
+    return false;
+  SchedModel.init(ST.getSchedModel(), &ST, AAII);
+  if (!SchedModel.hasInstrSchedModel())
+    return false;
+
+  // A simple check to exit this pass early for targets that do not need it.
+  if (earlyExitVectElement(&MF))
+    return false;
+
+  bool Changed = false;
+  std::map<unsigned, bool> VecInstElemTable;
+  SmallVector<MachineInstr *, 8> RemoveMIs;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+         MII != MIE;) {
+      MachineInstr &MI = *MII;
+      if (optimizeVectElement(MI, &VecInstElemTable)) {
+        // Add MI to the list of instructions to be removed given that it has
+        // been replaced.
+        RemoveMIs.push_back(&MI);
+        Changed = true;
+      }
+      ++MII;
+    }
+  }
+
+  for (MachineInstr *MI : RemoveMIs)
+    MI->eraseFromParent();
+
+  return Changed;
+}
+
+/// createAArch64VectorByElementOptPass - Returns an instance of the
+/// vector by element optimization pass.
+FunctionPass *llvm::createAArch64VectorByElementOptPass() {
+  return new AArch64VectorByElementOpt();
+}
Index: llvm/trunk/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/Target/AArch64/CMakeLists.txt
+++ llvm/trunk/lib/Target/AArch64/CMakeLists.txt
@@ -62,6 +62,7 @@
   AArch64TargetMachine.cpp
   AArch64TargetObjectFile.cpp
   AArch64TargetTransformInfo.cpp
+  AArch64VectorByElementOpt.cpp
   ${GLOBAL_ISEL_BUILD_FILES}
   )
Index: llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s
+; The instruction latencies of Exynos-M1 trigger the transform checked for under the EXYNOS prefix.
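+; On this subtarget an instruction such as
+;   fmla v0.4s, v1.4s, v2.s[1]
+; is expected to be emitted as
+;   dup v3.4s, v2.s[1]
+;   fmla v0.4s, v1.4s, v3.4s
+; which is the pattern the EXYNOS lines below match.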
 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
@@ -382,6 +384,10 @@
 ; CHECK-LABEL: test_vfma_lane_f32:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -394,6 +400,10 @@
 ; CHECK-LABEL: test_vfmaq_lane_f32:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -406,6 +416,10 @@
 ; CHECK-LABEL: test_vfma_laneq_f32:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -416,6 +430,10 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f32:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -426,6 +444,10 @@
 ; CHECK-LABEL: test_vfms_lane_f32:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -437,6 +459,10 @@
 ; CHECK-LABEL: test_vfmsq_lane_f32:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -448,6 +474,10 @@
 ; CHECK-LABEL: test_vfms_laneq_f32:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
@@ -459,6 +489,10 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f32:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -470,6 +504,10 @@
 ; CHECK-LABEL: test_vfmaq_lane_f64:
 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -482,6 +520,10 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f64:
 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -492,6 +534,10 @@
 ; CHECK-LABEL: test_vfmsq_lane_f64:
 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <1 x double> <double -0.000000e+00>, %v
   %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
@@ -503,6 +549,10 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f64:
 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -514,6 +564,9 @@
 ; CHECK-LABEL: test_vfmas_laneq_f32
 ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmas_laneq_f32
+; EXYNOS: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; EXYNOS-NEXT: ret
 entry:
   %extract = extractelement <4 x float> %v, i32 3
   %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
@@ -539,6 +592,9 @@
 ; CHECK-LABEL: test_vfmss_lane_f32
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmss_lane_f32
+; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x float> %v, i32 1
   %extract = fsub float -0.000000e+00, %extract.rhs
@@ -561,6 +617,9 @@
 ; CHECK-LABEL: test_vfmsd_laneq_f64
 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsd_laneq_f64
+; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x double> %v, i32 1
   %extract = fsub double -0.000000e+00, %extract.rhs
@@ -583,6 +642,9 @@
 ; CHECK-LABEL: test_vfmss_lane_f32_0
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmss_lane_f32_0
+; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS-NEXT: ret
 entry:
   %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %tmp1 = extractelement <2 x float> %tmp0, i32 1
@@ -1408,6 +1470,10 @@
 ; CHECK-LABEL: test_vmul_lane_f32:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %mul = fmul <2 x float> %shuffle, %a
@@ -1418,6 +1484,9 @@
 ; CHECK-LABEL: test_vmul_lane_f64:
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f64:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -1431,6 +1500,10 @@
 ; CHECK-LABEL: test_vmulq_lane_f32:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = fmul <4 x float> %shuffle, %a
@@ -1441,6 +1514,10 @@
 ; CHECK-LABEL: test_vmulq_lane_f64:
 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x double> %shuffle, %a
@@ -1451,6 +1528,10 @@
 ; CHECK-LABEL: test_vmul_laneq_f32:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %mul = fmul <2 x float> %shuffle, %a
@@ -1461,6 +1542,9 @@
 ; CHECK-LABEL: test_vmul_laneq_f64:
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f64:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -1474,6 +1558,10 @@
 ; CHECK-LABEL: test_vmulq_laneq_f32:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = fmul <4 x float> %shuffle, %a
@@ -1484,6 +1572,10 @@
 ; CHECK-LABEL: test_vmulq_laneq_f64:
 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %mul = fmul <2 x double> %shuffle, %a
@@ -1494,6 +1586,10 @@
 ; CHECK-LABEL: test_vmulx_lane_f32:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1504,6 +1600,10 @@
 ; CHECK-LABEL: test_vmulxq_lane_f32:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1514,6 +1614,10 @@
 ; CHECK-LABEL: test_vmulxq_lane_f64:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1524,6 +1628,10 @@
 ; CHECK-LABEL: test_vmulx_laneq_f32:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1534,6 +1642,10 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f32:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1544,6 +1656,10 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f64:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1890,6 +2006,10 @@
 ; CHECK-LABEL: test_vfma_lane_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1900,6 +2020,10 @@
 ; CHECK-LABEL: test_vfmaq_lane_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1910,6 +2034,10 @@
 ; CHECK-LABEL: test_vfma_laneq_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1920,6 +2048,10 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f32_0:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1930,6 +2062,10 @@
 ; CHECK-LABEL: test_vfms_lane_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
@@ -1941,6 +2077,10 @@
 ; CHECK-LABEL: test_vfmsq_lane_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
@@ -1952,6 +2092,10 @@
 ; CHECK-LABEL: test_vfms_laneq_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
@@ -1963,6 +2107,10 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
@@ -1974,6 +2122,10 @@
 ; CHECK-LABEL: test_vfmaq_laneq_f64_0:
 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f64_0:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -1984,6 +2136,10 @@
 ; CHECK-LABEL: test_vfmsq_laneq_f64_0:
 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f64_0:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
@@ -2787,6 +2943,10 @@
 ; CHECK-LABEL: test_vmul_lane_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x float> %shuffle, %a
@@ -2797,6 +2957,10 @@
 ; CHECK-LABEL: test_vmulq_lane_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %mul = fmul <4 x float> %shuffle, %a
@@ -2807,6 +2971,10 @@
 ; CHECK-LABEL: test_vmul_laneq_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x float> %shuffle, %a
@@ -2817,6 +2985,9 @@
 ; CHECK-LABEL: test_vmul_laneq_f64_0:
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f64_0:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -2830,6 +3001,10 @@
 ; CHECK-LABEL: test_vmulq_laneq_f32_0:
 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %mul = fmul <4 x float> %shuffle, %a
@@ -2840,6 +3015,10 @@
 ; CHECK-LABEL: test_vmulq_laneq_f64_0:
 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f64_0:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x double> %shuffle, %a
@@ -2850,6 +3029,10 @@
 ; CHECK-LABEL: test_vmulx_lane_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2860,6 +3043,10 @@
 ; CHECK-LABEL: test_vmulxq_lane_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2870,6 +3057,10 @@
 ; CHECK-LABEL: test_vmulxq_lane_f64_0:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f64_0:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -2880,6 +3071,10 @@
 ; CHECK-LABEL: test_vmulx_laneq_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2890,6 +3085,10 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f32_0:
 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2900,9 +3099,51 @@
 ; CHECK-LABEL: test_vmulxq_laneq_f64_0:
 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f64_0:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   ret <2 x double> %vmulx2.i
 }
+
+define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
+; CHECK-LABEL: optimize_dup:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: optimize_dup:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
+entry:
+  %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
+  %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %1 = fmul <4 x float> %lane2, %c
+  %s = fsub <4 x float> %0, %1
+  ret <4 x float> %s
+}
+
+define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
+; CHECK-LABEL: no_optimize_dup:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: no_optimize_dup:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS: dup [[y:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[y]].4s
+; EXYNOS-NEXT: ret
+entry:
+  %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
+  %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %1 = fmul <4 x float> %lane2, %c
+  %s = fsub <4 x float> %0, %1
+  ret <4 x float> %s
+}
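+
+; Note that in @optimize_dup both by-element operations read lane 3 of the
+; same register, so the pass materializes a single DUP and reuses it, while in
+; @no_optimize_dup the lanes differ (s[3] vs. s[1]) and two DUPs are required.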