Index: include/llvm/CodeGen/MachineCombinerPattern.h =================================================================== --- include/llvm/CodeGen/MachineCombinerPattern.h +++ include/llvm/CodeGen/MachineCombinerPattern.h @@ -68,12 +68,18 @@ FMLAv4i32_indexed_OP2, FMLSv1i32_indexed_OP2, FMLSv1i64_indexed_OP2, - FMLSv2i32_indexed_OP2, - FMLSv2i64_indexed_OP2, FMLSv2f32_OP2, + FMLSv2f32_OP1, + FMLSv2f64_OP1, FMLSv2f64_OP2, - FMLSv4i32_indexed_OP2, - FMLSv4f32_OP2 + FMLSv2i32_indexed_OP1, + FMLSv2i32_indexed_OP2, + FMLSv2i64_indexed_OP1, + FMLSv2i64_indexed_OP2, + FMLSv4f32_OP1, + FMLSv4f32_OP2, + FMLSv4i32_indexed_OP1, + FMLSv4i32_indexed_OP2 }; } // end namespace llvm Index: lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.cpp +++ lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3672,6 +3672,15 @@ } break; case AArch64::FSUBv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1); + Found = true; + } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); @@ -3683,6 +3692,15 @@ } break; case AArch64::FSUBv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1); + Found = true; + } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2i64_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); @@ -3694,6 +3712,15 @@ } break; case 
AArch64::FSUBv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1); + Found = true; + } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv4i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); @@ -3790,12 +3817,15 @@ /// \param MaddOpc the opcode fo the f|madd instruction /// \param RC Register class of operands /// \param kind of fma instruction (addressing mode) to be generated +/// \param ReplacedAddend is the result register from the instruction +/// replacing the non-combined operand, if any. static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, - FMAInstKind kind = FMAInstKind::Default) { + FMAInstKind kind = FMAInstKind::Default, + const unsigned *ReplacedAddend = nullptr) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; @@ -3805,8 +3835,17 @@ bool Src0IsKill = MUL->getOperand(1).isKill(); unsigned SrcReg1 = MUL->getOperand(2).getReg(); bool Src1IsKill = MUL->getOperand(2).isKill(); - unsigned SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); - bool Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); + + unsigned SrcReg2; + bool Src2IsKill; + if (ReplacedAddend) { + // If we just generated a new addend, we must be its only use. 
+ SrcReg2 = *ReplacedAddend; + Src2IsKill = true; + } else { + SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); + Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); + } if (TargetRegisterInfo::isVirtualRegister(ResultReg)) MRI.constrainRegClass(ResultReg, RC); @@ -4326,6 +4365,66 @@ FMAInstKind::Accumulator); } break; + case MachineCombinerPattern::FMLSv2f32_OP1: + case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { + RC = &AArch64::FPR64RegClass; + unsigned NewVR = MRI.createVirtualRegister(RC); + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR) + .add(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed, &NewVR); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator, &NewVR); + } + break; + } + case MachineCombinerPattern::FMLSv4f32_OP1: + case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { + RC = &AArch64::FPR128RegClass; + unsigned NewVR = MRI.createVirtualRegister(RC); + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR) + .add(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed, &NewVR); + } else { + Opc = AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator, &NewVR); + } + break; + } + case MachineCombinerPattern::FMLSv2f64_OP1: + case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { + RC = &AArch64::FPR128RegClass; + unsigned NewVR = 
MRI.createVirtualRegister(RC); + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR) + .add(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed, &NewVR); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator, &NewVR); + } + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); Index: test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir @@ -0,0 +1,82 @@ +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math %s | FileCheck --check-prefix=UNPROFITABLE %s +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s | FileCheck --check-prefix=PROFITABLE %s +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynosm1 -enable-unsafe-fp-math %s | FileCheck --check-prefix=PROFITABLE %s +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math %s | FileCheck --check-prefix=PROFITABLE %s +# +name: f1_2s +registers: + - { id: 0, class: fpr64 } + - { id: 1, class: fpr64 } + - { id: 2, class: fpr64 } + - { id: 3, class: fpr64 } + - { id: 4, class: fpr64 } +body: | + bb.0.entry: + %2:fpr64 = COPY %d2 + %1:fpr64 = COPY %d1 + %0:fpr64 = COPY %d0 + %3:fpr64 = FMULv2f32 %0, %1 + %4:fpr64 = FSUBv2f32 killed %3, %2 + %d0 = COPY %4 + RET_ReallyLR implicit %d0 + +... 
+# UNPROFITABLE-LABEL: name: f1_2s +# UNPROFITABLE: %3:fpr64 = FMULv2f32 %0, %1 +# UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2 +# +# PROFITABLE-LABEL: name: f1_2s +# PROFITABLE: %5:fpr64 = FNEGv2f32 %2 +# PROFITABLE-NEXT: FMLAv2f32 killed %5, %0, %1 +--- +name: f1_4s +registers: + - { id: 0, class: fpr128 } + - { id: 1, class: fpr128 } + - { id: 2, class: fpr128 } + - { id: 3, class: fpr128 } + - { id: 4, class: fpr128 } +body: | + bb.0.entry: + %2:fpr128 = COPY %q2 + %1:fpr128 = COPY %q1 + %0:fpr128 = COPY %q0 + %3:fpr128 = FMULv4f32 %0, %1 + %4:fpr128 = FSUBv4f32 killed %3, %2 + %q0 = COPY %4 + RET_ReallyLR implicit %q0 + +... +# UNPROFITABLE-LABEL: name: f1_4s +# UNPROFITABLE: %3:fpr128 = FMULv4f32 %0, %1 +# UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2 +# +# PROFITABLE-LABEL: name: f1_4s +# PROFITABLE: %5:fpr128 = FNEGv4f32 %2 +# PROFITABLE-NEXT: FMLAv4f32 killed %5, %0, %1 +--- +name: f1_2d +registers: + - { id: 0, class: fpr128 } + - { id: 1, class: fpr128 } + - { id: 2, class: fpr128 } + - { id: 3, class: fpr128 } + - { id: 4, class: fpr128 } +body: | + bb.0.entry: + %2:fpr128 = COPY %q2 + %1:fpr128 = COPY %q1 + %0:fpr128 = COPY %q0 + %3:fpr128 = FMULv2f64 %0, %1 + %4:fpr128 = FSUBv2f64 killed %3, %2 + %q0 = COPY %4 + RET_ReallyLR implicit %q0 + +... +# UNPROFITABLE-LABEL: name: f1_2d +# UNPROFITABLE: %3:fpr128 = FMULv2f64 %0, %1 +# UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2 +# +# PROFITABLE-LABEL: name: f1_2d +# PROFITABLE: %5:fpr128 = FNEGv2f64 %2 +# PROFITABLE-NEXT: FMLAv2f64 killed %5, %0, %1