diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
--- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -38,6 +38,33 @@
   MULSUBX_OP2,
   MULADDXI_OP1,
   MULSUBXI_OP1,
+  // NEON integer vectors
+  VMULADDv8i8_OP1,
+  VMULADDv8i8_OP2,
+  VMULADDv16i8_OP1,
+  VMULADDv16i8_OP2,
+  VMULADDv4i16_OP1,
+  VMULADDv4i16_OP2,
+  VMULADDv8i16_OP1,
+  VMULADDv8i16_OP2,
+  VMULADDv2i32_OP1,
+  VMULADDv2i32_OP2,
+  VMULADDv4i32_OP1,
+  VMULADDv4i32_OP2,
+
+  VMULSUBv8i8_OP1,
+  VMULSUBv8i8_OP2,
+  VMULSUBv16i8_OP1,
+  VMULSUBv16i8_OP2,
+  VMULSUBv4i16_OP1,
+  VMULSUBv4i16_OP2,
+  VMULSUBv8i16_OP1,
+  VMULSUBv8i16_OP2,
+  VMULSUBv2i32_OP1,
+  VMULSUBv2i32_OP2,
+  VMULSUBv4i32_OP1,
+  VMULSUBv4i32_OP2,
+
   // Floating Point
   FMULADDH_OP1,
   FMULADDH_OP2,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3571,6 +3571,18 @@
   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
   case AArch64::SUBXri:
   case AArch64::SUBSXri:
+  case AArch64::ADDv8i8:
+  case AArch64::ADDv16i8:
+  case AArch64::ADDv4i16:
+  case AArch64::ADDv8i16:
+  case AArch64::ADDv2i32:
+  case AArch64::ADDv4i32:
+  case AArch64::SUBv8i8:
+  case AArch64::SUBv16i8:
+  case AArch64::SUBv4i16:
+  case AArch64::SUBv8i16:
+  case AArch64::SUBv2i32:
+  case AArch64::SUBv4i32:
     return true;
   default:
     break;
@@ -3713,6 +3725,13 @@
     }
   };

+  auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
+    if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
+      Patterns.push_back(Pattern);
+      Found = true;
+    }
+  };
+
   typedef MachineCombinerPattern MCP;

   switch (Opc) {
@@ -3748,6 +3767,54 @@
   case AArch64::SUBXri:
     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
     break;
+  case AArch64::ADDv8i8:
+    setVFound(AArch64::MULv8i8, 1, MCP::VMULADDv8i8_OP1);
+    setVFound(AArch64::MULv8i8, 2, MCP::VMULADDv8i8_OP2);
+    break;
+  case AArch64::ADDv16i8:
+    setVFound(AArch64::MULv16i8, 1, MCP::VMULADDv16i8_OP1);
+    setVFound(AArch64::MULv16i8, 2, MCP::VMULADDv16i8_OP2);
+    break;
+  case AArch64::ADDv4i16:
+    setVFound(AArch64::MULv4i16, 1, MCP::VMULADDv4i16_OP1);
+    setVFound(AArch64::MULv4i16, 2, MCP::VMULADDv4i16_OP2);
+    break;
+  case AArch64::ADDv8i16:
+    setVFound(AArch64::MULv8i16, 1, MCP::VMULADDv8i16_OP1);
+    setVFound(AArch64::MULv8i16, 2, MCP::VMULADDv8i16_OP2);
+    break;
+  case AArch64::ADDv2i32:
+    setVFound(AArch64::MULv2i32, 1, MCP::VMULADDv2i32_OP1);
+    setVFound(AArch64::MULv2i32, 2, MCP::VMULADDv2i32_OP2);
+    break;
+  case AArch64::ADDv4i32:
+    setVFound(AArch64::MULv4i32, 1, MCP::VMULADDv4i32_OP1);
+    setVFound(AArch64::MULv4i32, 2, MCP::VMULADDv4i32_OP2);
+    break;
+  case AArch64::SUBv8i8:
+    setVFound(AArch64::MULv8i8, 1, MCP::VMULSUBv8i8_OP1);
+    setVFound(AArch64::MULv8i8, 2, MCP::VMULSUBv8i8_OP2);
+    break;
+  case AArch64::SUBv16i8:
+    setVFound(AArch64::MULv16i8, 1, MCP::VMULSUBv16i8_OP1);
+    setVFound(AArch64::MULv16i8, 2, MCP::VMULSUBv16i8_OP2);
+    break;
+  case AArch64::SUBv4i16:
+    setVFound(AArch64::MULv4i16, 1, MCP::VMULSUBv4i16_OP1);
+    setVFound(AArch64::MULv4i16, 2, MCP::VMULSUBv4i16_OP2);
+    break;
+  case AArch64::SUBv8i16:
+    setVFound(AArch64::MULv8i16, 1, MCP::VMULSUBv8i16_OP1);
+    setVFound(AArch64::MULv8i16, 2, MCP::VMULSUBv8i16_OP2);
+    break;
+  case AArch64::SUBv2i32:
+    setVFound(AArch64::MULv2i32, 1, MCP::VMULSUBv2i32_OP1);
+    setVFound(AArch64::MULv2i32, 2, MCP::VMULSUBv2i32_OP2);
+    break;
+  case AArch64::SUBv4i32:
+    setVFound(AArch64::MULv4i32, 1, MCP::VMULSUBv4i32_OP1);
+    setVFound(AArch64::MULv4i32, 2, MCP::VMULSUBv4i32_OP2);
+    break;
   }
   return Found;
 }
@@ -4063,6 +4130,19 @@
   return MUL;
 }

+/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
+/// instructions.
+///
+/// \see genFusedMultiply
+static MachineInstr *genFusedMultiplyAcc(
+    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
+    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
+    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC,
+    const Register *ReplacedAddend = nullptr) {
+  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
+                          FMAInstKind::Accumulator, ReplacedAddend);
+}
+
 /// genMaddR - Generate madd instruction and combine mul and add using
 /// an extra virtual register
 /// Example - an ADD intermediate needs to be stored in a register:
@@ -4302,6 +4382,129 @@
     }
     break;
   }
+
+  case MachineCombinerPattern::VMULADDv8i8_OP1:
+    Opc = AArch64::MLAv8i8;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULADDv8i8_OP2:
+    Opc = AArch64::MLAv8i8;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULADDv16i8_OP1:
+    Opc = AArch64::MLAv16i8;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULADDv16i8_OP2:
+    Opc = AArch64::MLAv16i8;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULADDv4i16_OP1:
+    Opc = AArch64::MLAv4i16;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULADDv4i16_OP2:
+    Opc = AArch64::MLAv4i16;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULADDv8i16_OP1:
+    Opc = AArch64::MLAv8i16;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULADDv8i16_OP2:
+    Opc = AArch64::MLAv8i16;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULADDv2i32_OP1:
+    Opc = AArch64::MLAv2i32;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULADDv2i32_OP2:
+    Opc = AArch64::MLAv2i32;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULADDv4i32_OP1:
+    Opc = AArch64::MLAv4i32;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULADDv4i32_OP2:
+    Opc = AArch64::MLAv4i32;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+
+  case MachineCombinerPattern::VMULSUBv8i8_OP1:
+    Opc = AArch64::MLSv8i8;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULSUBv8i8_OP2:
+    Opc = AArch64::MLSv8i8;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULSUBv16i8_OP1:
+    Opc = AArch64::MLSv16i8;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULSUBv16i8_OP2:
+    Opc = AArch64::MLSv16i8;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULSUBv4i16_OP1:
+    Opc = AArch64::MLSv4i16;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULSUBv4i16_OP2:
+    Opc = AArch64::MLSv4i16;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULSUBv8i16_OP1:
+    Opc = AArch64::MLSv8i16;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULSUBv8i16_OP2:
+    Opc = AArch64::MLSv8i16;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULSUBv2i32_OP1:
+    Opc = AArch64::MLSv2i32;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULSUBv2i32_OP2:
+    Opc = AArch64::MLSv2i32;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULSUBv4i32_OP1:
+    Opc = AArch64::MLSv4i32;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::VMULSUBv4i32_OP2:
+    Opc = AArch64::MLSv4i32;
+    RC = &AArch64::FPR128RegClass;
+    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+
   // Floating Point Support
   case MachineCombinerPattern::FMULADDH_OP1:
     Opc = AArch64::FMADDHrrr;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3793,10 +3793,8 @@
 defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
 defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
 defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
-defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
-    TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
-defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
-    TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>; // Generated in MachineCombine
+defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>; // Generated in MachineCombine
 defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
 defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
 defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir
@@ -1433,8 +1433,9 @@
     ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2
     ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK: [[MLAv8i8_:%[0-9]+]]:fpr64 = MLAv8i8 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLAv8i8_]]
+    ; CHECK: [[MULv8i8_:%[0-9]+]]:fpr64 = MULv8i8 [[COPY1]], [[COPY]]
+    ; CHECK: [[ADDv8i8_:%[0-9]+]]:fpr64 = ADDv8i8 [[MULv8i8_]], [[COPY2]]
+    ; CHECK: $noreg = PATCHABLE_RET [[ADDv8i8_]]
    %4:fpr(<8 x s8>) = COPY $d2
    %3:fpr(<8 x s8>) = COPY $d1
    %2:fpr(<8 x s8>) = COPY $d0
@@ -1468,8 +1469,9 @@
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[MLAv16i8_:%[0-9]+]]:fpr128 = MLAv16i8 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLAv16i8_]]
+    ; CHECK: [[MULv16i8_:%[0-9]+]]:fpr128 = MULv16i8 [[COPY1]], [[COPY]]
+    ; CHECK: [[ADDv16i8_:%[0-9]+]]:fpr128 = ADDv16i8 [[MULv16i8_]], [[COPY2]]
+    ; CHECK: $noreg = PATCHABLE_RET [[ADDv16i8_]]
    %4:fpr(<16 x s8>) = COPY $q2
    %3:fpr(<16 x s8>) = COPY $q1
    %2:fpr(<16 x s8>) = COPY $q0
@@ -1503,8 +1505,9 @@
    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2
    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK: [[MLAv4i16_:%[0-9]+]]:fpr64 = MLAv4i16 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLAv4i16_]]
+    ; CHECK: [[MULv4i16_:%[0-9]+]]:fpr64 = MULv4i16 [[COPY1]], [[COPY]]
+    ; CHECK: [[ADDv4i16_:%[0-9]+]]:fpr64 = ADDv4i16 [[MULv4i16_]], [[COPY2]]
+    ; CHECK: $noreg = PATCHABLE_RET [[ADDv4i16_]]
    %4:fpr(<4 x s16>) = COPY $d2
    %3:fpr(<4 x s16>) = COPY $d1
    %2:fpr(<4 x s16>) = COPY $d0
@@ -1538,8 +1541,9 @@
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[MLAv8i16_:%[0-9]+]]:fpr128 = MLAv8i16 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLAv8i16_]]
+    ; CHECK: [[MULv8i16_:%[0-9]+]]:fpr128 = MULv8i16 [[COPY1]], [[COPY]]
+    ; CHECK: [[ADDv8i16_:%[0-9]+]]:fpr128 = ADDv8i16 [[MULv8i16_]], [[COPY2]]
+    ; CHECK: $noreg = PATCHABLE_RET [[ADDv8i16_]]
    %4:fpr(<8 x s16>) = COPY $q2
    %3:fpr(<8 x s16>) = COPY $q1
    %2:fpr(<8 x s16>) = COPY $q0
@@ -1759,8 +1763,9 @@
    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2
    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK: [[MLAv8i8_:%[0-9]+]]:fpr64 = MLAv8i8 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLAv8i8_]]
+    ; CHECK: [[MULv8i8_:%[0-9]+]]:fpr64 = MULv8i8 [[COPY1]], [[COPY]]
+    ; CHECK: [[ADDv8i8_:%[0-9]+]]:fpr64 = ADDv8i8 [[COPY2]], [[MULv8i8_]]
+    ; CHECK: $noreg = PATCHABLE_RET [[ADDv8i8_]]
    %4:fpr(<8 x s8>) = COPY $d2
    %3:fpr(<8 x s8>) = COPY $d1
    %2:fpr(<8 x s8>) = COPY $d0
@@ -1794,8 +1799,9 @@
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[MLAv16i8_:%[0-9]+]]:fpr128 = MLAv16i8 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLAv16i8_]]
+    ; CHECK: [[MULv16i8_:%[0-9]+]]:fpr128 = MULv16i8 [[COPY1]], [[COPY]]
+    ; CHECK: [[ADDv16i8_:%[0-9]+]]:fpr128 = ADDv16i8 [[COPY2]], [[MULv16i8_]]
+    ; CHECK: $noreg = PATCHABLE_RET [[ADDv16i8_]]
    %4:fpr(<16 x s8>) = COPY $q2
    %3:fpr(<16 x s8>) = COPY $q1
    %2:fpr(<16 x s8>) = COPY $q0
@@ -1829,8 +1835,9 @@
    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2
    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK: [[MLAv4i16_:%[0-9]+]]:fpr64 = MLAv4i16 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLAv4i16_]]
+    ; CHECK: [[MULv4i16_:%[0-9]+]]:fpr64 = MULv4i16 [[COPY1]], [[COPY]]
+    ; CHECK: [[ADDv4i16_:%[0-9]+]]:fpr64 = ADDv4i16 [[COPY2]], [[MULv4i16_]]
+    ; CHECK: $noreg = PATCHABLE_RET [[ADDv4i16_]]
    %4:fpr(<4 x s16>) = COPY $d2
    %3:fpr(<4 x s16>) = COPY $d1
    %2:fpr(<4 x s16>) = COPY $d0
@@ -1864,8 +1871,9 @@
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[MLAv8i16_:%[0-9]+]]:fpr128 = MLAv8i16 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLAv8i16_]]
+    ; CHECK: [[MULv8i16_:%[0-9]+]]:fpr128 = MULv8i16 [[COPY1]], [[COPY]]
+    ; CHECK: [[ADDv8i16_:%[0-9]+]]:fpr128 = ADDv8i16 [[COPY2]], [[MULv8i16_]]
+    ; CHECK: $noreg = PATCHABLE_RET [[ADDv8i16_]]
    %4:fpr(<8 x s16>) = COPY $q2
    %3:fpr(<8 x s16>) = COPY $q1
    %2:fpr(<8 x s16>) = COPY $q0
@@ -2085,8 +2093,9 @@
    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2
    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK: [[MLSv8i8_:%[0-9]+]]:fpr64 = MLSv8i8 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLSv8i8_]]
+    ; CHECK: [[MULv8i8_:%[0-9]+]]:fpr64 = MULv8i8 [[COPY1]], [[COPY]]
+    ; CHECK: [[SUBv8i8_:%[0-9]+]]:fpr64 = SUBv8i8 [[COPY2]], [[MULv8i8_]]
+    ; CHECK: $noreg = PATCHABLE_RET [[SUBv8i8_]]
    %4:fpr(<8 x s8>) = COPY $d2
    %3:fpr(<8 x s8>) = COPY $d1
    %2:fpr(<8 x s8>) = COPY $d0
@@ -2120,8 +2129,9 @@
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[MLSv16i8_:%[0-9]+]]:fpr128 = MLSv16i8 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLSv16i8_]]
+    ; CHECK: [[MULv16i8_:%[0-9]+]]:fpr128 = MULv16i8 [[COPY1]], [[COPY]]
+    ; CHECK: [[SUBv16i8_:%[0-9]+]]:fpr128 = SUBv16i8 [[COPY2]], [[MULv16i8_]]
+    ; CHECK: $noreg = PATCHABLE_RET [[SUBv16i8_]]
    %4:fpr(<16 x s8>) = COPY $q2
    %3:fpr(<16 x s8>) = COPY $q1
    %2:fpr(<16 x s8>) = COPY $q0
@@ -2155,8 +2165,9 @@
    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2
    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK: [[MLSv4i16_:%[0-9]+]]:fpr64 = MLSv4i16 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLSv4i16_]]
+    ; CHECK: [[MULv4i16_:%[0-9]+]]:fpr64 = MULv4i16 [[COPY1]], [[COPY]]
+    ; CHECK: [[SUBv4i16_:%[0-9]+]]:fpr64 = SUBv4i16 [[COPY2]], [[MULv4i16_]]
+    ; CHECK: $noreg = PATCHABLE_RET [[SUBv4i16_]]
    %4:fpr(<4 x s16>) = COPY $d2
    %3:fpr(<4 x s16>) = COPY $d1
    %2:fpr(<4 x s16>) = COPY $d0
@@ -2190,8 +2201,9 @@
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[MLSv8i16_:%[0-9]+]]:fpr128 = MLSv8i16 [[COPY2]], [[COPY1]], [[COPY]]
-    ; CHECK: $noreg = PATCHABLE_RET [[MLSv8i16_]]
+    ; CHECK: [[MULv8i16_:%[0-9]+]]:fpr128 = MULv8i16 [[COPY1]], [[COPY]]
+    ; CHECK: [[SUBv8i16_:%[0-9]+]]:fpr128 = SUBv8i16 [[COPY2]], [[MULv8i16_]]
+    ; CHECK: $noreg = PATCHABLE_RET [[SUBv8i16_]]
    %4:fpr(<8 x s16>) = COPY $q2
    %3:fpr(<8 x s16>) = COPY $q1
    %2:fpr(<8 x s16>) = COPY $q0
diff --git a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
--- a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
+++ b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
@@ -5,17 +5,17 @@
 ; CHECK-LABEL: jsimd_idct_ifast_neon_intrinsic:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr q0, [x1, #32]
-; CHECK-NEXT:    ldr q1, [x0, #32]
-; CHECK-NEXT:    ldr q2, [x1, #96]
+; CHECK-NEXT:    ldr q1, [x1, #96]
+; CHECK-NEXT:    ldr q2, [x0, #32]
 ; CHECK-NEXT:    ldr q3, [x0, #96]
 ; CHECK-NEXT:    ldr x8, [x2, #48]
-; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    mla v1.8h, v3.8h, v2.8h
 ; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    str q1, [x8, x9]
+; CHECK-NEXT:    mul v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    mul v1.8h, v3.8h, v1.8h
+; CHECK-NEXT:    add v2.8h, v0.8h, v1.8h
+; CHECK-NEXT:    str q2, [x8, x9]
 ; CHECK-NEXT:    ldr x8, [x2, #56]
-; CHECK-NEXT:    mls v0.8h, v3.8h, v2.8h
+; CHECK-NEXT:    sub v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    str q0, [x8, x9]
 ; CHECK-NEXT:    ret
 entry:
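
For reference, a minimal IR example of the code shape this change affects (illustrative only; the function name, argument names, and the v8i16 type are not taken from the patch): with the MLA/MLS selection patterns switched to null_frag, instruction selection emits a plain MULv8i16 followed by ADDv8i16, and the AArch64 MachineCombiner then uses the new VMULADDv8i16_OP1/_OP2 patterns to decide, from its critical-path and depth estimates, whether to rewrite the pair into MLAv8i16 or keep the two-instruction sequence.

define <8 x i16> @mul_add_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %acc) {
  %mul = mul <8 x i16> %a, %b      ; selected as MULv8i16
  %add = add <8 x i16> %mul, %acc  ; selected as ADDv8i16; may be combined into MLAv8i16
  ret <8 x i16> %add
}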