Index: llvm/include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -34,6 +34,10 @@
   REASSOC_XY_BCA,
   REASSOC_XY_BAC,
 
+  // These are patterns used to reduce the length of the dependence chain.
+  SUBADD_OP1,
+  SUBADD_OP2,
+
   // These are multiply-add patterns matched by the AArch64 machine combiner.
   MULADDW_OP1,
   MULADDW_OP2,
Index: llvm/lib/CodeGen/MachineCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/MachineCombiner.cpp
+++ llvm/lib/CodeGen/MachineCombiner.cpp
@@ -92,9 +92,11 @@
   bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
   bool combineInstructions(MachineBasicBlock *);
   MachineInstr *getOperandDef(const MachineOperand &MO);
-  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
-                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-                    MachineTraceMetrics::Trace BlockTrace);
+  bool isCoalescableCopy(MachineInstr *MI);
+  std::pair<unsigned, unsigned>
+  getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
+           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+           MachineTraceMetrics::Trace BlockTrace);
   unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                       MachineTraceMetrics::Trace BlockTrace);
   bool
@@ -158,6 +160,43 @@
   return DefInstr;
 }
 
+/// Check if \p MI is a COPY instruction whose src and dst registers can be
+/// coalesced.
+bool MachineCombiner::isCoalescableCopy(MachineInstr *MI) {
+  if (!MI->isCopy())
+    return false;
+
+  Register Dst = MI->getOperand(0).getReg();
+  Register Src = MI->getOperand(1).getReg();
+
+  if (!MI->isFullCopy()) {
+    // If the src RC contains super registers of the dst RC, it can also be
+    // coalesced.
+    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
+      return false;
+
+    auto SrcSub = MI->getOperand(1).getSubReg();
+    auto SrcRC = MRI->getRegClass(Src);
+    auto DstRC = MRI->getRegClass(Dst);
+    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
+  }
+
+  if (Src.isPhysical() && Dst.isPhysical())
+    return Src == Dst;
+
+  if (Src.isVirtual() && Dst.isVirtual()) {
+    auto SrcRC = MRI->getRegClass(Src);
+    auto DstRC = MRI->getRegClass(Dst);
+    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
+  }
+
+  if (Src.isVirtual())
+    std::swap(Src, Dst);
+
+  // Now Src is a physical register and Dst is a virtual register.
+  auto DstRC = MRI->getRegClass(Dst);
+  return DstRC->contains(Src);
+}
+
 /// Computes depth of instructions in vector \InsInstr.
 ///
 /// \param InsInstrs is a vector of machine instructions
@@ -165,8 +204,8 @@
 /// of defining machine instruction in \p InsInstrs
 /// \param BlockTrace is a trace of machine instructions
 ///
-/// \returns Depth of last instruction in \InsInstrs ("NewRoot")
-unsigned
+/// \returns Depth of the first and the last instruction ("NewRoot") in \p InsInstrs
+std::pair<unsigned, unsigned>
 MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                           MachineTraceMetrics::Trace BlockTrace) {
@@ -204,9 +243,10 @@
         MachineInstr *DefInstr = getOperandDef(MO);
         if (DefInstr) {
           DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
-          LatencyOp = TSchedModel.computeOperandLatency(
-              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
-              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+          if (!isCoalescableCopy(DefInstr))
+            LatencyOp = TSchedModel.computeOperandLatency(
+                DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
+                InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
         }
       }
       IDepth = std::max(IDepth, DepthOp + LatencyOp);
@@ -214,7 +254,7 @@
     InstrDepth.push_back(IDepth);
   }
   unsigned NewRootIdx = InsInstrs.size() - 1;
-  return InstrDepth[NewRootIdx];
+  return {InstrDepth[0], InstrDepth[NewRootIdx]};
 }
 
 /// Computes instruction latency as max of latency of defined operands.
@@ -277,6 +317,8 @@
   case MachineCombinerPattern::REASSOC_XA_YB:
   case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
   case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
+  case MachineCombinerPattern::SUBADD_OP1:
+  case MachineCombinerPattern::SUBADD_OP2:
     return CombinerObjective::MustReduceDepth;
   case MachineCombinerPattern::REASSOC_XY_BCA:
   case MachineCombinerPattern::REASSOC_XY_BAC:
@@ -337,8 +379,11 @@
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
          "Missing machine model\n");
   // Get depth and latency of NewRoot and Root.
-  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
+  unsigned NewFirstDepth, NewRootDepth;
+  std::tie(NewFirstDepth, NewRootDepth) =
+      getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
+  unsigned FirstDepth = BlockTrace.getInstrCycles(*DelInstrs[0]).Depth;
 
   LLVM_DEBUG(dbgs() << "  Dependence data for " << *Root << "\tNewRootDepth: "
                     << NewRootDepth << "\tRootDepth: " << RootDepth);
@@ -366,9 +411,9 @@
       getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
 
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
-  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+  unsigned NewCycleCount = NewFirstDepth + NewRootLatency;
   unsigned OldCycleCount =
-      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
+      FirstDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
   LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                     << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                     << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
@@ -381,7 +426,7 @@
   LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                     << ", OldCycleCount = " << OldCycleCount << "\n");
 
-  return NewCycleCount <= OldCycleCount;
+  return NewCycleCount < OldCycleCount;
 }
 
 /// helper routine to convert instructions into SC
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4827,6 +4827,10 @@
       return false;
   }
 
+  if (isCombineInstrSettingFlag(CombineOpc) &&
+      MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+    return false;
+
   return true;
 }
 
@@ -5323,6 +5327,42 @@
   } // end switch (Pattern)
   return false;
 }
+
+/// Find other MI combine patterns.
+static bool getMiscPatterns(MachineInstr &Root,
+                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  // A - (B + C)  ==>  (A - B) - C  or  (A - C) - B
+  unsigned Opc = Root.getOpcode();
+  MachineBasicBlock &MBB = *Root.getParent();
+
+  switch (Opc) {
+  case AArch64::SUBWrr:
+  case AArch64::SUBSWrr:
+  case AArch64::SUBXrr:
+  case AArch64::SUBSXrr:
+    // Found candidate root.
+    break;
+  default:
+    return false;
+  }
+
+  if (isCombineInstrSettingFlag(Opc) &&
+      Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+    return false;
+
+  if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
+      canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
+      canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
+      canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
+    Patterns.push_back(MachineCombinerPattern::SUBADD_OP1);
+    Patterns.push_back(MachineCombinerPattern::SUBADD_OP2);
+    return true;
+  }
+
+  return false;
+}
+
 /// Return true when there is potentially a faster code sequence for an
 /// instruction chain ending in \p Root. All potential patterns are listed in
 /// the \p Pattern vector. Pattern should be sorted in priority order since the
@@ -5340,6 +5380,10 @@
   if (getFMAPatterns(Root, Patterns))
     return true;
 
+  // Other patterns
+  if (getMiscPatterns(Root, Patterns))
+    return true;
+
   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                      DoRegPressureReduce);
 }
@@ -5587,6 +5631,53 @@
   return MUL;
 }
 
+/// Do one of the following transformations:
+/// A - (B + C)  ==>  (A - B) - C
+/// A - (B + C)  ==>  (A - C) - B
+static void
+genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
+                 const TargetInstrInfo *TII, MachineInstr &Root,
+                 SmallVectorImpl<MachineInstr *> &InsInstrs,
+                 SmallVectorImpl<MachineInstr *> &DelInstrs,
+                 unsigned IdxOpd1,
+                 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
+  assert(IdxOpd1 == 1 || IdxOpd1 == 2);
+  unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
+  MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
+
+  Register ResultReg = Root.getOperand(0).getReg();
+  Register RegA = Root.getOperand(1).getReg();
+  bool RegAIsKill = Root.getOperand(1).isKill();
+  Register RegB = AddMI->getOperand(IdxOpd1).getReg();
+  bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
+  Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
+  bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
+  Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA));
+
+  unsigned Opcode = Root.getOpcode();
+  if (Opcode == AArch64::SUBSWrr)
+    Opcode = AArch64::SUBWrr;
+  else if (Opcode == AArch64::SUBSXrr)
+    Opcode = AArch64::SUBXrr;
+  else
+    assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
+           "Unexpected instruction opcode.");
+
+  MachineInstrBuilder MIB1 =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), NewVR)
+          .addReg(RegA, getKillRegState(RegAIsKill))
+          .addReg(RegB, getKillRegState(RegBIsKill));
+  MachineInstrBuilder MIB2 =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), ResultReg)
+          .addReg(NewVR, getKillRegState(true))
+          .addReg(RegC, getKillRegState(RegCIsKill));
+
+  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+  InsInstrs.push_back(MIB1);
+  InsInstrs.push_back(MIB2);
+  DelInstrs.push_back(AddMI);
+}
+
 /// When getMachineCombinerPatterns() finds potential patterns,
 /// this function generates the instructions that could replace the
 /// original code sequence
@@ -5609,6 +5700,18 @@
     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
                                                 DelInstrs, InstrIdxForVirtReg);
     return;
+  case MachineCombinerPattern::SUBADD_OP1:
+    // A - (B + C)
+    // ==> (A - B) - C
+    genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
+                     InstrIdxForVirtReg);
+    break;
+  case MachineCombinerPattern::SUBADD_OP2:
+    // A - (B + C)
+    // ==> (A - C) - B
+    genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
+                     InstrIdxForVirtReg);
+    break;
   case MachineCombinerPattern::MULADDW_OP1:
   case MachineCombinerPattern::MULADDX_OP1:
     // MUL I=A,B,0
Index: llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
+++ llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
@@ -1,8 +1,8 @@
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=UNPROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s -machine-combiner-verify-pattern-order=true | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx3t110 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=UNPROFITABLE %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s -machine-combiner-verify-pattern-order=true | FileCheck --check-prefixes=PROFITABLE %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=EXYNOS %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=THUNDER2 %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx3t110 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=THUNDER3 %s
 #
 name:            f1_2s
 registers:
@@ -26,9 +26,21 @@
 # UNPROFITABLE: %3:fpr64 = FMULv2f32 %0, %1
 # UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2
 #
+# THUNDER2-LABEL: name: f1_2s
+# THUNDER2: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
+# THUNDER2-NEXT: FMLAv2f32 killed [[R1]], %0, %1
+#
+# THUNDER3-LABEL: name: f1_2s
+# THUNDER3: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
+# THUNDER3-NEXT: FMLAv2f32 killed [[R1]], %0, %1
+#
 # PROFITABLE-LABEL: name: f1_2s
 # PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
 # PROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1
+#
+# EXYNOS-LABEL: name: f1_2s
+# EXYNOS: %3:fpr64 = FMULv2f32 %0, %1
+# EXYNOS-NEXT: FSUBv2f32 killed %3, %2
 ---
 name:            f1_4s
 registers:
@@ -52,9 +64,21 @@
 # UNPROFITABLE: %3:fpr128 = FMULv4f32 %0, %1
 # UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2
 #
+# THUNDER2-LABEL: name: f1_4s
+# THUNDER2: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
+# THUNDER2-NEXT: FMLAv4f32 killed [[R1]], %0, %1
+#
+# THUNDER3-LABEL: name: f1_4s
+# THUNDER3: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
+# THUNDER3-NEXT: FMLAv4f32 killed [[R1]], %0, %1
+#
 # PROFITABLE-LABEL: name: f1_4s
 # PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
 # PROFITABLE-NEXT: FMLAv4f32 killed [[R1]], %0, %1
+#
+# EXYNOS-LABEL: name: f1_4s
+# EXYNOS: %3:fpr128 = FMULv4f32 %0, %1
+# EXYNOS-NEXT: FSUBv4f32 killed %3, %2
 ---
 name:            f1_2d
 registers:
@@ -78,9 +102,21 @@
 # UNPROFITABLE: %3:fpr128 = FMULv2f64 %0, %1
 # UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2
 #
+# THUNDER2-LABEL: name: f1_2d
+# THUNDER2: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
+# THUNDER2-NEXT: FMLAv2f64 killed [[R1]], %0, %1
+#
+# THUNDER3-LABEL: name: f1_2d
+# THUNDER3: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
+# THUNDER3-NEXT: FMLAv2f64 killed [[R1]], %0, %1
+#
 # PROFITABLE-LABEL: name: f1_2d
 # PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
 # PROFITABLE-NEXT: FMLAv2f64 killed [[R1]], %0, %1
+#
+# EXYNOS-LABEL: name: f1_2d
+# EXYNOS: %3:fpr128 = FMULv2f64 %0, %1
+# EXYNOS-NEXT: FSUBv2f64 killed %3, %2
 ---
 name:            f1_both_fmul_2s
 registers:
@@ -104,9 +140,29 @@
     RET_ReallyLR implicit $q0
 
 ...
-# ALL-LABEL: name: f1_both_fmul_2s
-# ALL: %4:fpr64 = FMULv2f32 %0, %1
-# ALL-NEXT: FMLSv2f32 killed %4, %2, %3
+# UNPROFITABLE-LABEL: name: f1_both_fmul_2s
+# UNPROFITABLE: %4:fpr64 = FMULv2f32 %0, %1
+# UNPROFITABLE-NEXT: %5:fpr64 = FMULv2f32 %2, %3
+# UNPROFITABLE-NEXT: FSUBv2f32 killed %4, %5
+#
+# THUNDER2-LABEL: name: f1_both_fmul_2s
+# THUNDER2: %4:fpr64 = FMULv2f32 %0, %1
+# THUNDER2-NEXT: %5:fpr64 = FMULv2f32 %2, %3
+# THUNDER2-NEXT: FSUBv2f32 killed %4, %5
+#
+# THUNDER3-LABEL: name: f1_both_fmul_2s
+# THUNDER3: %4:fpr64 = FMULv2f32 %0, %1
+# THUNDER3-NEXT: %6:fpr64 = FMLSv2f32 killed %4, %2, %3
+#
+# PROFITABLE-LABEL: name: f1_both_fmul_2s
+# PROFITABLE: %4:fpr64 = FMULv2f32 %0, %1
+# PROFITABLE-NEXT: %5:fpr64 = FMULv2f32 %2, %3
+# PROFITABLE-NEXT: FSUBv2f32 killed %4, %5
+#
+# EXYNOS-LABEL: name: f1_both_fmul_2s
+# EXYNOS: %4:fpr64 = FMULv2f32 %0, %1
+# EXYNOS-NEXT: %5:fpr64 = FMULv2f32 %2, %3
+# EXYNOS-NEXT: FSUBv2f32 killed %4, %5
 ---
 name:            f1_both_fmul_4s
 registers:
@@ -130,9 +186,29 @@
     RET_ReallyLR implicit $q0
 
 ...
-# ALL-LABEL: name: f1_both_fmul_4s
-# ALL: %4:fpr128 = FMULv4f32 %0, %1
-# ALL-NEXT: FMLSv4f32 killed %4, %2, %3
+# UNPROFITABLE-LABEL: name: f1_both_fmul_4s
+# UNPROFITABLE: %4:fpr128 = FMULv4f32 %0, %1
+# UNPROFITABLE-NEXT: %5:fpr128 = FMULv4f32 %2, %3
+# UNPROFITABLE-NEXT: FSUBv4f32 killed %4, %5
+#
+# THUNDER2-LABEL: name: f1_both_fmul_4s
+# THUNDER2: %4:fpr128 = FMULv4f32 %0, %1
+# THUNDER2-NEXT: %5:fpr128 = FMULv4f32 %2, %3
+# THUNDER2-NEXT: FSUBv4f32 killed %4, %5
+#
+# THUNDER3-LABEL: name: f1_both_fmul_4s
+# THUNDER3: %4:fpr128 = FMULv4f32 %0, %1
+# THUNDER3-NEXT: %6:fpr128 = FMLSv4f32 killed %4, %2, %3
+#
+# PROFITABLE-LABEL: name: f1_both_fmul_4s
+# PROFITABLE: %4:fpr128 = FMULv4f32 %0, %1
+# PROFITABLE-NEXT: %5:fpr128 = FMULv4f32 %2, %3
+# PROFITABLE-NEXT: FSUBv4f32 killed %4, %5
+#
+# EXYNOS-LABEL: name: f1_both_fmul_4s
+# EXYNOS: %4:fpr128 = FMULv4f32 %0, %1
+# EXYNOS-NEXT: %5:fpr128 = FMULv4f32 %2, %3
+# EXYNOS-NEXT: FSUBv4f32 killed %4, %5
 ---
 name:            f1_both_fmul_2d
 registers:
@@ -156,7 +232,27 @@
     RET_ReallyLR implicit $q0
 
 ...
-# ALL-LABEL: name: f1_both_fmul_2d
-# ALL: %4:fpr128 = FMULv2f64 %0, %1
-# ALL-NEXT: FMLSv2f64 killed %4, %2, %3
+# UNPROFITABLE-LABEL: name: f1_both_fmul_2d
+# UNPROFITABLE: %4:fpr128 = FMULv2f64 %0, %1
+# UNPROFITABLE-NEXT: %5:fpr128 = FMULv2f64 %2, %3
+# UNPROFITABLE-NEXT: FSUBv2f64 killed %4, %5
+#
+# THUNDER2-LABEL: name: f1_both_fmul_2d
+# THUNDER2: %4:fpr128 = FMULv2f64 %0, %1
+# THUNDER2-NEXT: %5:fpr128 = FMULv2f64 %2, %3
+# THUNDER2-NEXT: FSUBv2f64 killed %4, %5
+#
+# THUNDER3-LABEL: name: f1_both_fmul_2d
+# THUNDER3: %4:fpr128 = FMULv2f64 %0, %1
+# THUNDER3-NEXT: %6:fpr128 = FMLSv2f64 killed %4, %2, %3
+#
+# PROFITABLE-LABEL: name: f1_both_fmul_2d
+# PROFITABLE: %4:fpr128 = FMULv2f64 %0, %1
+# PROFITABLE-NEXT: %5:fpr128 = FMULv2f64 %2, %3
+# PROFITABLE-NEXT: FSUBv2f64 killed %4, %5
+#
+# EXYNOS-LABEL: name: f1_both_fmul_2d
+# EXYNOS: %4:fpr128 = FMULv2f64 %0, %1
+# EXYNOS-NEXT: %5:fpr128 = FMULv2f64 %2, %3
+# EXYNOS-NEXT: FSUBv2f64 killed %4, %5
Index: llvm/test/CodeGen/AArch64/addsub_ext.ll
===================================================================
--- llvm/test/CodeGen/AArch64/addsub_ext.ll
+++ llvm/test/CodeGen/AArch64/addsub_ext.ll
@@ -570,8 +570,9 @@
 define dso_local i64 @madd_fold_uxtw(i32 %x, i64 %y) {
 ; CHECK-LABEL: madd_fold_uxtw:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0x3
-; CHECK-NEXT:    madd x0, x1, x1, x8
+; CHECK-NEXT:    mul x8, x1, x1
+; CHECK-NEXT:    and w9, w0, #0x3
+; CHECK-NEXT:    add x0, x8, x9
 ; CHECK-NEXT:    ret
 entry:
   %m = and i32 %x, 3
Index: llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
+++ llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -3,7 +3,8 @@
 define void @foo_2d(double* %src) {
 ; CHECK-LABEL: %entry
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 entry:
   %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
   %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
Index: llvm/test/CodeGen/AArch64/fadd-combines.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fadd-combines.ll
+++ llvm/test/CodeGen/AArch64/fadd-combines.ll
@@ -230,7 +230,8 @@
 ; CHECK-LABEL: fadd_fma_fmul_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmul s2, s2, s3
-; CHECK-NEXT:    fmadd s0, s0, s1, s2
+; CHECK-NEXT:    fmul s0, s0, s1
+; CHECK-NEXT:    fadd s0, s0, s2
 ; CHECK-NEXT:    fadd s0, s4, s0
 ; CHECK-NEXT:    ret
   %m1 = fmul float %a, %b
Index: llvm/test/CodeGen/AArch64/i128-math.ll
===================================================================
--- llvm/test/CodeGen/AArch64/i128-math.ll
+++ llvm/test/CodeGen/AArch64/i128-math.ll
@@ -307,9 +307,10 @@
 ; CHECK-LABEL: u128_mul:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    mul x9, x1, x2
 ; CHECK-NEXT:    madd x8, x0, x3, x8
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    madd x1, x1, x2, x8
+; CHECK-NEXT:    add x1, x8, x9
 ; CHECK-NEXT:    ret
   %1 = mul i128 %x, %y
   ret i128 %1
@@ -405,9 +406,10 @@
 ; CHECK-LABEL: i128_mul:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    mul x9, x1, x2
 ; CHECK-NEXT:    madd x8, x0, x3, x8
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    madd x1, x1, x2, x8
+; CHECK-NEXT:    add x1, x8, x9
 ; CHECK-NEXT:    ret
   %1 = mul i128 %x, %y
   ret i128 %1
Index: llvm/test/CodeGen/AArch64/machine-combiner-madd.ll
===================================================================
--- llvm/test/CodeGen/AArch64/machine-combiner-madd.ll
+++ llvm/test/CodeGen/AArch64/machine-combiner-madd.ll
@@ -10,13 +10,12 @@
 ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx3t110 < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=tsv110 < %s | FileCheck %s
 
-; Make sure that inst-combine fuses the multiply add in the addressing mode of
-; the load.
+; Make sure that the machine combiner doesn't fuse the multiply-add, because
+; the latency of max(mul, load) + add is shorter than that of load + madd.
 
 ; CHECK-LABEL: fun:
-; CHECK-NOT: mul
-; CHECK: madd
-; CHECK-NOT: mul
+; CHECK: mul
+; CHECK-NOT: madd
 
 %class.D = type { %class.basic_string.base, [4 x i8] }
 %class.basic_string.base = type <{ i64, i64, i32 }>
Index: llvm/test/CodeGen/AArch64/machine-combiner-subadd.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/machine-combiner-subadd.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck %s
+
+; The test cases in this file check the following transformation in cases where
+; the rewritten form can reduce latency:
+; A - (B + C)  ==>  (A - B) - C
+
+; 32 bit version.
+define i32 @test1(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub w8, w2, w0
+; CHECK-NEXT:    eor w9, w1, w0, lsl #8
+; CHECK-NEXT:    sub w8, w8, w9
+; CHECK-NEXT:    eor w0, w8, w9, asr #13
+; CHECK-NEXT:    ret
+entry:
+  %shl = shl i32 %a, 8
+  %xor = xor i32 %shl, %b
+  %add = add i32 %xor, %a
+  %sub = sub i32 %c, %add
+  %shr = ashr i32 %xor, 13
+  %xor2 = xor i32 %sub, %shr
+  ret i32 %xor2
+}
+
+; 64 bit version.
+define i64 @test2(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub x8, x2, x0
+; CHECK-NEXT:    eor x9, x1, x0, lsl #8
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    eor x0, x8, x9, asr #13
+; CHECK-NEXT:    ret
+entry:
+  %shl = shl i64 %a, 8
+  %xor = xor i64 %shl, %b
+  %add = add i64 %xor, %a
+  %sub = sub i64 %c, %add
+  %shr = ashr i64 %xor, 13
+  %xor2 = xor i64 %sub, %shr
+  ret i64 %xor2
+}
+
+; Negative test. The rewritten form cannot reduce latency here.
+define i32 @test3(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: test3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add w8, w2, w0
+; CHECK-NEXT:    eor w9, w1, w0, lsl #8
+; CHECK-NEXT:    sub w8, w9, w8
+; CHECK-NEXT:    eor w0, w8, w9, asr #13
+; CHECK-NEXT:    ret
+entry:
+  %shl = shl i32 %a, 8
+  %xor = xor i32 %shl, %b
+  %add = add i32 %c, %a
+  %sub = sub i32 %xor, %add
+  %shr = ashr i32 %xor, 13
+  %xor2 = xor i32 %sub, %shr
+  ret i32 %xor2
+}
+
Index: llvm/test/CodeGen/AArch64/madd-combiner.ll
===================================================================
--- llvm/test/CodeGen/AArch64/madd-combiner.ll
+++ llvm/test/CodeGen/AArch64/madd-combiner.ll
@@ -6,8 +6,8 @@
 define i32 @mul_add_imm(i32 %a, i32 %b) {
 ; CHECK-LABEL: mul_add_imm:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    orr w8, wzr, #0x4
-; CHECK-NEXT:    madd w0, w0, w1, w8
+; CHECK-NEXT:    mul w8, w0, w1
+; CHECK-NEXT:    add w0, w8, #4
 ; CHECK-NEXT:    ret
   %1 = mul i32 %a, %b
   %2 = add i32 %1, 4
Index: llvm/test/CodeGen/AArch64/madd-lohi.ll
===================================================================
--- llvm/test/CodeGen/AArch64/madd-lohi.ll
+++ llvm/test/CodeGen/AArch64/madd-lohi.ll
@@ -6,17 +6,19 @@
 ; CHECK-LABEL: test_128bitmul:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    mul x9, x1, x2
 ; CHECK-NEXT:    madd x8, x0, x3, x8
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    madd x1, x1, x2, x8
+; CHECK-NEXT:    add x1, x8, x9
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_128bitmul:
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    umulh x8, x1, x3
+; CHECK-BE-NEXT:    mul x9, x0, x3
 ; CHECK-BE-NEXT:    madd x8, x1, x2, x8
 ; CHECK-BE-NEXT:    mul x1, x1, x3
-; CHECK-BE-NEXT:    madd x0, x0, x3, x8
+; CHECK-BE-NEXT:    add x0, x8, x9
 ; CHECK-BE-NEXT:    ret
Index: llvm/test/CodeGen/AArch64/mul-lohi.ll
===================================================================
--- llvm/test/CodeGen/AArch64/mul-lohi.ll
+++ llvm/test/CodeGen/AArch64/mul-lohi.ll
@@ -3,16 +3,20 @@
 define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: test_128bitmul:
+; CHECK: mul [[TEMP0:x[0-9]+]], x0, x3
 ; CHECK: umulh [[HI:x[0-9]+]], x0, x2
-; CHECK: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
+; CHECK: add [[TEMP1:x[0-9]+]], [[HI]], [[TEMP0]]
+; CHECK: mul [[TEMP2:x[0-9]+]], x1, x2
+; CHECK-DAG: add x1, [[TEMP1]], [[TEMP2]]
 ; CHECK-DAG: mul x0, x0, x2
 ; CHECK-NEXT: ret
 
 ; CHECK-BE-LABEL: test_128bitmul:
+; CHECK-BE: mul [[TEMP0:x[0-9]+]], x1, x2
 ; CHECK-BE: umulh [[HI:x[0-9]+]], x1, x3
-; CHECK-BE: madd [[TEMP1:x[0-9]+]], x1, x2, [[HI]]
-; CHECK-BE-DAG: madd x0, x0, x3, [[TEMP1]]
+; CHECK-BE: add [[TEMP1:x[0-9]+]], [[HI]], [[TEMP0]]
+; CHECK-BE: mul [[TEMP2:x[0-9]+]], x0, x3
+; CHECK-BE-DAG: add x0, [[TEMP1]], [[TEMP2]]
 ; CHECK-BE-DAG: mul x1, x1, x3
 ; CHECK-BE-NEXT: ret
Index: llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
===================================================================
--- llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
+++ llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
@@ -354,14 +354,15 @@
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI13_0]
 ; CHECK-NEXT:    adrp x8, .LCPI13_1
 ; CHECK-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI13_1]
 ; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI13_1]
 ; CHECK-NEXT:    adrp x8, .LCPI13_2
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI13_2]
+; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    mul v2.4s, v0.4s, v3.4s
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI13_2]
 ; CHECK-NEXT:    adrp x8, .LCPI13_3
-; CHECK-NEXT:    sshl v2.4s, v1.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    sshl v2.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    usra v2.4s, v1.4s, #31
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI13_3]
 ; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
@@ -383,14 +384,15 @@
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
 ; CHECK-NEXT:    adrp x8, .LCPI14_1
 ; CHECK-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI14_1]
 ; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI14_1]
 ; CHECK-NEXT:    adrp x8, .LCPI14_2
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI14_2]
+; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    mul v2.4s, v0.4s, v3.4s
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI14_2]
 ; CHECK-NEXT:    adrp x8, .LCPI14_3
-; CHECK-NEXT:    sshl v2.4s, v1.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    sshl v2.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    usra v2.4s, v1.4s, #31
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI14_3]
 ; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
@@ -412,14 +414,15 @@
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_0]
 ; CHECK-NEXT:    adrp x8, .LCPI15_1
 ; CHECK-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI15_1]
 ; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI15_1]
 ; CHECK-NEXT:    adrp x8, .LCPI15_2
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI15_2]
+; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    mul v2.4s, v0.4s, v3.4s
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI15_2]
 ; CHECK-NEXT:    adrp x8, .LCPI15_3
-; CHECK-NEXT:    sshl v2.4s, v1.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    sshl v2.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    usra v2.4s, v1.4s, #31
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_3]
 ; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s