Index: llvm/lib/Target/AArch64/AArch64.h =================================================================== --- llvm/lib/Target/AArch64/AArch64.h +++ llvm/lib/Target/AArch64/AArch64.h @@ -36,6 +36,7 @@ FunctionPass *createAArch64StorePairSuppressPass(); FunctionPass *createAArch64ExpandPseudoPass(); FunctionPass *createAArch64LoadStoreOptimizationPass(); +FunctionPass *createAArch64VectorByElementOptPass(); ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64AddressTypePromotionPass(); @@ -57,6 +58,7 @@ void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); void initializeAArch64ExpandPseudoPass(PassRegistry&); void initializeAArch64LoadStoreOptPass(PassRegistry&); +void initializeAArch64VectorByElementOptPass(PassRegistry&); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -124,6 +124,11 @@ cl::desc("Enable the loop data prefetch pass"), cl::init(true)); +static cl::opt + EnableVectorByElement ("enable-vector-by-element-opt", cl::Hidden, + cl::init(true), + cl::desc("Enable vector by element optimization")); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. 
RegisterTargetMachine X(TheAArch64leTarget); @@ -142,6 +147,7 @@ initializeAArch64DeadRegisterDefinitionsPass(*PR); initializeAArch64ExpandPseudoPass(*PR); initializeAArch64LoadStoreOptPass(*PR); + initializeAArch64VectorByElementOptPass(*PR); initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); @@ -447,6 +453,8 @@ addPass(&EarlyIfConverterID); if (EnableStPairSuppress) addPass(createAArch64StorePairSuppressPass()); + if (EnableVectorByElement) + addPass(createAArch64VectorByElementOptPass()); return true; } @@ -476,6 +484,7 @@ void AArch64PassConfig::addPreSched2() { // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); + // Use load/store pair instructions when possible. if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) addPass(createAArch64LoadStoreOptimizationPass()); Index: llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp @@ -0,0 +1,330 @@ +//=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that performs optimization for vector by element +// SIMD instructions. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-vectorbyelement-opt" + +STATISTIC(NumModifiedInstr, "Number of vector by element instructions modified"); + +#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \ + "AArch64 vector by element instruction optimization pass" + +namespace { + +struct AArch64VectorByElementOpt : public MachineFunctionPass { + static char ID; + AArch64VectorByElementOpt() : MachineFunctionPass(ID) { + initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry()); + } + + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + /// Based only on latency of instructions, determine if it is cost efficient + /// to replace the instruction InstDesc by the two instructions InstDescRep1 + /// and InstDescRep2. + /// Return true if replacement is recommended. + bool shouldReplaceInstruction(MachineFunction *MF, + const MCInstrDesc *InstDesc, + const MCInstrDesc *InstDescRep1, + const MCInstrDesc *InstDescRep2, + std::map& VecInstElemTable) + const; + + /// Check whether an equivalent DUP instruction has already been + /// created or not. + /// Return true when the dup instruction already exists. In this case, + /// DestReg will point to the destination of the already created DUP. + bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg, + unsigned LaneNumber, unsigned *DestReg) const; + + /// Certain SIMD instructions with vector element operand are not efficient. + /// Rewrite them into SIMD instructions with vector operands. This rewrite + /// is driven by the latency of the instructions. + /// Return true if the SIMD instruction is modified. 
+ bool optimizeVectElement(MachineInstr &MI, + std::map *VecInstElemTable) const; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME; + } +}; +char AArch64VectorByElementOpt::ID = 0; +} // namespace + +INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt", + AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false) + +/// Based only on latency of instructions, determine if it is cost efficient +/// to replace the instruction InstDesc by the two instructions InstDescRep1 +/// and InstDescRep2. Note that it is assumed in this function that an +/// instruction of type InstDesc is always replaced by the same two +/// instructions as results are cached here. +/// Return true if replacement is recommended. +bool AArch64VectorByElementOpt::shouldReplaceInstruction(MachineFunction *MF, + const MCInstrDesc *InstDesc, const MCInstrDesc *InstDescRep1, + const MCInstrDesc *InstDescRep2, + std::map &VecInstElemTable) const { + + // Check if replacement decision is already available in the cached table. + // If so, return it. 
+ if (!VecInstElemTable.empty() && + VecInstElemTable.find(InstDesc->getOpcode()) != VecInstElemTable.end()) + return VecInstElemTable[InstDesc->getOpcode()]; + + unsigned SCIdx = InstDesc->getSchedClass(); + unsigned SCIdxRep1 = InstDescRep1->getSchedClass(); + unsigned SCIdxRep2 = InstDescRep2->getSchedClass(); + + const TargetSubtargetInfo &ST = MF->getSubtarget(); + const AArch64InstrInfo *TII = + static_cast(ST.getInstrInfo()); + if (!TII) return false; + TargetSchedModel SchedModel; + SchedModel.init(ST.getSchedModel(), &ST, TII); + if (!SchedModel.hasInstrSchedModel()) + return false; + + const MCSchedClassDesc *SCDesc = + SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); + const MCSchedClassDesc *SCDescRep1 = + SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1); + const MCSchedClassDesc *SCDescRep2 = + SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2); + + // If a subtarget does not define resources for any of the instructions + // of interest, then return false for no replacement. + if (!SCDesc->isValid() || SCDesc->isVariant() + || !SCDescRep1->isValid() || SCDescRep1->isVariant() + || !SCDescRep2->isValid() || SCDescRep2->isVariant()) { + VecInstElemTable[InstDesc->getOpcode()] = false; + return false; + } + + if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > + SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) + + SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) { + VecInstElemTable[InstDesc->getOpcode()] = true; + return true; + } else { + VecInstElemTable[InstDesc->getOpcode()] = false; + return false; + } +} + +/// Check whether an equivalent DUP instruction has already been +/// created or not. +/// Return true when the dup instruction already exists. In this case, +/// DestReg will point to the destination of the already created DUP. 
+bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode, + unsigned SrcReg, unsigned LaneNumber, + unsigned *DestReg) const { + for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin(); + MII != MIE; ) { + MII--; + MachineInstr *CurrentMI = &*MII; + + if (CurrentMI->getOpcode() == DupOpcode && + CurrentMI->getNumOperands() == 3 && + CurrentMI->getOperand(1).getReg() == SrcReg && + CurrentMI->getOperand(2).getImm() == LaneNumber) { + *DestReg = CurrentMI->getOperand(0).getReg(); + return true; + } + } + + return false; +} + +/// Certain SIMD instructions with vector element operand are not efficient. +/// Rewrite them into SIMD instructions with vector operands. This rewrite +/// is driven by the latency of the instructions. +/// The instruction of concerns are for the time being fmla, fmls, fmul, +/// and fmulx and hence they are hardcoded. +/// +/// Example: +/// fmla v0.4s, v1.4s, v2.s[1] +/// is rewritten into +/// dup v3.4s, v2.s[1] // dup not necessary if redundant +/// fmla v0.4s, v1.4s, v3.4s +/// Return true if the SIMD instruction is modified. 
+bool AArch64VectorByElementOpt::optimizeVectElement(MachineInstr &MI, + std::map* VecInstElemTable) const { + const MCInstrDesc *MulMCID, *DupMCID; + const TargetRegisterClass *RC = &AArch64::FPR128RegClass;; + + switch (MI.getOpcode()) { + default: + return false; + + // 4X32 instructions + case AArch64::FMLAv4i32_indexed: + DupMCID = &TII->get(AArch64::DUPv4i32lane); + MulMCID = &TII->get(AArch64::FMLAv4f32); + break; + case AArch64::FMLSv4i32_indexed: + DupMCID = &TII->get(AArch64::DUPv4i32lane); + MulMCID = &TII->get(AArch64::FMLSv4f32); + break; + case AArch64::FMULXv4i32_indexed: + DupMCID = &TII->get(AArch64::DUPv4i32lane); + MulMCID = &TII->get(AArch64::FMULXv4f32); + break; + case AArch64::FMULv4i32_indexed: + DupMCID = &TII->get(AArch64::DUPv4i32lane); + MulMCID = &TII->get(AArch64::FMULv4f32); + break; + + // 2X64 instructions + case AArch64::FMLAv2i64_indexed: + DupMCID = &TII->get(AArch64::DUPv2i64lane); + MulMCID = &TII->get(AArch64::FMLAv2f64); + break; + case AArch64::FMLSv2i64_indexed: + DupMCID = &TII->get(AArch64::DUPv2i64lane); + MulMCID = &TII->get(AArch64::FMLSv2f64); + break; + case AArch64::FMULXv2i64_indexed: + DupMCID = &TII->get(AArch64::DUPv2i64lane); + MulMCID = &TII->get(AArch64::FMULXv2f64); + break; + case AArch64::FMULv2i64_indexed: + DupMCID = &TII->get(AArch64::DUPv2i64lane); + MulMCID = &TII->get(AArch64::FMULv2f64); + break; + + // 2X32 instructions + case AArch64::FMLAv2i32_indexed: + RC = &AArch64::FPR64RegClass; + DupMCID = &TII->get(AArch64::DUPv2i32lane); + MulMCID = &TII->get(AArch64::FMLAv2f32); + break; + case AArch64::FMLSv2i32_indexed: + RC = &AArch64::FPR64RegClass; + DupMCID = &TII->get(AArch64::DUPv2i32lane); + MulMCID = &TII->get(AArch64::FMLSv2f32); + break; + case AArch64::FMULXv2i32_indexed: + RC = &AArch64::FPR64RegClass; + DupMCID = &TII->get(AArch64::DUPv2i32lane); + MulMCID = &TII->get(AArch64::FMULXv2f32); + break; + case AArch64::FMULv2i32_indexed: + RC = &AArch64::FPR64RegClass; + DupMCID = 
&TII->get(AArch64::DUPv2i32lane); + MulMCID = &TII->get(AArch64::FMULv2f32); + break; + } + + if (!shouldReplaceInstruction(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), + DupMCID, MulMCID, *VecInstElemTable)) + return false; + + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock &MBB = *MI.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + // Get the operands of the current SIMD arithmetic instruction. + unsigned MulDest = MI.getOperand(0).getReg(); + unsigned SrcReg0 = MI.getOperand(1).getReg(); + unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill()); + unsigned SrcReg1 = MI.getOperand(2).getReg(); + unsigned Src1IsKill = getKillRegState (MI.getOperand(2).isKill()); + unsigned DupDest; + + // Instructions of interest have either 4 or 5 operands. + if (MI.getNumOperands() == 5) { + unsigned SrcReg2 = MI.getOperand(3).getReg(); + unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill()); + unsigned LaneNumber = MI.getOperand(4).getImm(); + + // Create a new DUP instruction. Note that if an equivalent DUP instruction + // has already been created before, then use that one instead of creating + // a new one. 
+ if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) { + DupDest = MRI.createVirtualRegister(RC); + BuildMI(MBB, MI, DL, *DupMCID, DupDest) + .addReg(SrcReg2, Src2IsKill) + .addImm(LaneNumber); + } + BuildMI(MBB, MI, DL, *MulMCID, MulDest) + .addReg(SrcReg0, Src0IsKill) + .addReg(SrcReg1, Src1IsKill) + .addReg(DupDest, Src2IsKill); + } else if (MI.getNumOperands() == 4) { + unsigned LaneNumber = MI.getOperand(3).getImm(); + if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) { + DupDest = MRI.createVirtualRegister(RC); + BuildMI(MBB, MI, DL, *DupMCID, DupDest) + .addReg(SrcReg1, Src1IsKill) + .addImm(LaneNumber); + } + BuildMI(MBB, MI, DL, *MulMCID, MulDest) + .addReg(SrcReg0, Src0IsKill) + .addReg(DupDest, Src1IsKill); + } else { + return false; + } + + ++NumModifiedInstr; + return true; +} + +bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + + bool Changed = false; + std::map VecInstElemTable; + SmallVector RemoveMIs; + + for (MachineBasicBlock &MBB : MF) { + for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); + MII != MIE; ) { + MachineInstr &MI = *MII; + if (optimizeVectElement(MI, &VecInstElemTable)) { + // Add MI to the list of instructions to be removed given that it has + // been replaced. + RemoveMIs.push_back(&MI); + Changed = true; + } + ++MII; + } + } + + for (MachineInstr *MI : RemoveMIs) + MI->eraseFromParent(); + + return Changed; +} + +/// createAArch64VectorByElementOptPass - returns an instance of the +/// vector by element optimization pass. 
+FunctionPass *llvm::createAArch64VectorByElementOptPass() { + return new AArch64VectorByElementOpt(); +} Index: llvm/lib/Target/AArch64/CMakeLists.txt =================================================================== --- llvm/lib/Target/AArch64/CMakeLists.txt +++ llvm/lib/Target/AArch64/CMakeLists.txt @@ -63,6 +63,7 @@ AArch64TargetMachine.cpp AArch64TargetObjectFile.cpp AArch64TargetTransformInfo.cpp + AArch64VectorByElementOpt.cpp ${GLOBAL_ISEL_BUILD_FILES} ) Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll +++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) @@ -382,6 +383,10 @@ ; CHECK-LABEL: test_vfma_lane_f32: ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfma_lane_f32: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -394,6 +399,10 @@ ; CHECK-LABEL: test_vfmaq_lane_f32: ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmaq_lane_f32: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ 
-406,6 +415,10 @@ ; CHECK-LABEL: test_vfma_laneq_f32: ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfma_laneq_f32: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -416,6 +429,10 @@ ; CHECK-LABEL: test_vfmaq_laneq_f32: ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmaq_laneq_f32: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -426,6 +443,10 @@ ; CHECK-LABEL: test_vfms_lane_f32: ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfms_lane_f32: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> @@ -437,6 +458,10 @@ ; CHECK-LABEL: test_vfmsq_lane_f32: ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmsq_lane_f32: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> @@ -448,6 +473,10 @@ ; CHECK-LABEL: test_vfms_laneq_f32: ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfms_laneq_f32: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOS: fmls 
{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> @@ -459,6 +488,10 @@ ; CHECK-LABEL: test_vfmsq_laneq_f32: ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmsq_laneq_f32: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> @@ -470,6 +503,10 @@ ; CHECK-LABEL: test_vfmaq_lane_f64: ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmaq_lane_f64: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d +; EXYNOS-NEXT: ret entry: %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -482,6 +519,10 @@ ; CHECK-LABEL: test_vfmaq_laneq_f64: ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmaq_laneq_f64: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d +; EXYNOS-NEXT: ret entry: %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -492,6 +533,10 @@ ; CHECK-LABEL: test_vfmsq_lane_f64: ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmsq_lane_f64: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d +; EXYNOS-NEXT: ret entry: %sub = fsub <1 x double> , %v %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer @@ -503,6 +548,10 @@ ; 
CHECK-LABEL: test_vfmsq_laneq_f64: ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmsq_laneq_f64: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d +; EXYNOS-NEXT: ret entry: %sub = fsub <2 x double> , %v %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> @@ -514,6 +563,9 @@ ; CHECK-LABEL: test_vfmas_laneq_f32 ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmas_laneq_f32 +; EXYNOS: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] +; EXYNOS-NEXT: ret entry: %extract = extractelement <4 x float> %v, i32 3 %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a) @@ -539,6 +591,9 @@ ; CHECK-LABEL: test_vfmss_lane_f32 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmss_lane_f32 +; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] +; EXYNOS-NEXT: ret entry: %extract.rhs = extractelement <2 x float> %v, i32 1 %extract = fsub float -0.000000e+00, %extract.rhs @@ -561,6 +616,9 @@ ; CHECK-LABEL: test_vfmsd_laneq_f64 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmsd_laneq_f64 +; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] +; EXYNOS-NEXT: ret entry: %extract.rhs = extractelement <2 x double> %v, i32 1 %extract = fsub double -0.000000e+00, %extract.rhs @@ -583,6 +641,9 @@ ; CHECK-LABEL: test_vfmss_lane_f32_0 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmss_lane_f32_0 +; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] +; EXYNOS-NEXT: ret entry: %tmp0 = fsub <2 x float> , %v %tmp1 = extractelement <2 x float> %tmp0, i32 1 @@ -1408,6 +1469,10 @@ ; CHECK-LABEL: test_vmul_lane_f32: ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmul_lane_f32: +; EXYNOS: 
dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %mul = fmul <2 x float> %shuffle, %a @@ -1418,6 +1483,9 @@ ; CHECK-LABEL: test_vmul_lane_f64: ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmul_lane_f64: +; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +; EXYNOS-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -1431,6 +1499,10 @@ ; CHECK-LABEL: test_vmulq_lane_f32: ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulq_lane_f32: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> %mul = fmul <4 x float> %shuffle, %a @@ -1441,6 +1513,10 @@ ; CHECK-LABEL: test_vmulq_lane_f64: ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulq_lane_f64: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %mul = fmul <2 x double> %shuffle, %a @@ -1451,6 +1527,10 @@ ; CHECK-LABEL: test_vmul_laneq_f32: ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmul_laneq_f32: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %mul = fmul <2 x float> %shuffle, %a @@ -1461,6 +1541,9 @@ ; CHECK-LABEL: test_vmul_laneq_f64: ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: 
test_vmul_laneq_f64: +; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] +; EXYNOS-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -1474,6 +1557,10 @@ ; CHECK-LABEL: test_vmulq_laneq_f32: ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulq_laneq_f32: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %mul = fmul <4 x float> %shuffle, %a @@ -1484,6 +1571,10 @@ ; CHECK-LABEL: test_vmulq_laneq_f64: ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulq_laneq_f64: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %mul = fmul <2 x double> %shuffle, %a @@ -1494,6 +1585,10 @@ ; CHECK-LABEL: test_vmulx_lane_f32: ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulx_lane_f32: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -1504,6 +1599,10 @@ ; CHECK-LABEL: test_vmulxq_lane_f32: ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulxq_lane_f32: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) 
@@ -1514,6 +1613,10 @@ ; CHECK-LABEL: test_vmulxq_lane_f64: ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulxq_lane_f64: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -1524,6 +1627,10 @@ ; CHECK-LABEL: test_vmulx_laneq_f32: ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulx_laneq_f32: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -1534,6 +1641,10 @@ ; CHECK-LABEL: test_vmulxq_laneq_f32: ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulxq_laneq_f32: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -1544,6 +1655,10 @@ ; CHECK-LABEL: test_vmulxq_laneq_f64: ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulxq_laneq_f64: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -1890,6 +2005,10 
@@ ; CHECK-LABEL: test_vfma_lane_f32_0: ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfma_lane_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -1900,6 +2019,10 @@ ; CHECK-LABEL: test_vfmaq_lane_f32_0: ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmaq_lane_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -1910,6 +2033,10 @@ ; CHECK-LABEL: test_vfma_laneq_f32_0: ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfma_laneq_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -1920,6 +2047,10 @@ ; CHECK-LABEL: test_vfmaq_laneq_f32_0: ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmaq_laneq_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -1930,6 +2061,10 @@ ; CHECK-LABEL: 
test_vfms_lane_f32_0: ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfms_lane_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer @@ -1941,6 +2076,10 @@ ; CHECK-LABEL: test_vfmsq_lane_f32_0: ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmsq_lane_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer @@ -1952,6 +2091,10 @@ ; CHECK-LABEL: test_vfms_laneq_f32_0: ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfms_laneq_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer @@ -1963,6 +2106,10 @@ ; CHECK-LABEL: test_vfmsq_laneq_f32_0: ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmsq_laneq_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer @@ -1974,6 +2121,10 @@ ; CHECK-LABEL: test_vfmaq_laneq_f64_0: ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmaq_laneq_f64_0: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d +; EXYNOS-NEXT: ret entry: 
%lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -1984,6 +2135,10 @@ ; CHECK-LABEL: test_vfmsq_laneq_f64_0: ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vfmsq_laneq_f64_0: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d +; EXYNOS-NEXT: ret entry: %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer @@ -2787,6 +2942,10 @@ ; CHECK-LABEL: test_vmul_lane_f32_0: ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmul_lane_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %mul = fmul <2 x float> %shuffle, %a @@ -2797,6 +2956,10 @@ ; CHECK-LABEL: test_vmulq_lane_f32_0: ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulq_lane_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %mul = fmul <4 x float> %shuffle, %a @@ -2807,6 +2970,10 @@ ; CHECK-LABEL: test_vmul_laneq_f32_0: ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmul_laneq_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %mul = fmul <2 x float> %shuffle, %a @@ -2817,6 +2984,9 @@ ; CHECK-LABEL: test_vmul_laneq_f64_0: ; 
CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmul_laneq_f64_0: +; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +; EXYNOS-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -2830,6 +3000,10 @@ ; CHECK-LABEL: test_vmulq_laneq_f32_0: ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulq_laneq_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %mul = fmul <4 x float> %shuffle, %a @@ -2840,6 +3014,10 @@ ; CHECK-LABEL: test_vmulq_laneq_f64_0: ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulq_laneq_f64_0: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %mul = fmul <2 x double> %shuffle, %a @@ -2850,6 +3028,10 @@ ; CHECK-LABEL: test_vmulx_lane_f32_0: ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulx_lane_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -2860,6 +3042,10 @@ ; CHECK-LABEL: test_vmulxq_lane_f32_0: ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulxq_lane_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %shuffle = 
shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -2870,6 +3056,10 @@ ; CHECK-LABEL: test_vmulxq_lane_f64_0: ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulxq_lane_f64_0: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -2880,6 +3070,10 @@ ; CHECK-LABEL: test_vmulx_laneq_f32_0: ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulx_laneq_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -2890,6 +3084,10 @@ ; CHECK-LABEL: test_vmulxq_laneq_f32_0: ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulxq_laneq_f32_0: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -2900,9 +3098,51 @@ ; CHECK-LABEL: test_vmulxq_laneq_f64_0: ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; CHECK-NEXT: ret +; EXYNOS-LABEL: test_vmulxq_laneq_f64_0: +; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d 
+; EXYNOS-NEXT: ret entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) ret <2 x double> %vmulx2.i } +define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) { +; CHECK-LABEL: optimize_dup: +; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret +; EXYNOS-LABEL: optimize_dup: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS-NEXT: ret +entry: + %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a) + %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %1 = fmul <4 x float> %lane2, %c + %s = fsub <4 x float> %0, %1 + ret <4 x float> %s +} + +define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) { +; CHECK-LABEL: no_optimize_dup: +; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret +; EXYNOS-LABEL: no_optimize_dup: +; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s +; EXYNOS: dup [[y:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[y]].4s +; EXYNOS-NEXT: ret +entry: + %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a) + %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %1 = fmul <4 x float> %lane2, %c + %s = fsub <4 x float> %0, %1 + ret <4 x float> %s +}