Index: llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -61,6 +61,18 @@
 // %6:fpr128 = IMPLICIT_DEF
 // %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
 //
+// 8. We can remove the COPY that feeds a DUP scalar by using a DUP element,
+//    because the scalar FPR is shared with the vector register. For example,
+//
+// %1:fpr16 = UADDLVv8i8v %0:fpr64
+// %2:fpr128 = INSERT_SUBREG %3:fpr128(tied-def 0), killed %1:fpr16, %subreg.hsub
+// %4:gpr32 = COPY %2.ssub:fpr128
+// %5:fpr128 = DUPv8i16gpr killed %4:gpr32
+// ==>
+// %1:fpr16 = UADDLVv8i8v %0:fpr64
+// %2:fpr128 = INSERT_SUBREG %3:fpr128(tied-def 0), killed %1:fpr16, %subreg.hsub
+// %5:fpr128 = DUPv8i16lane killed %2:fpr128, 0
+//
 //===----------------------------------------------------------------------===//
 
 #include "AArch64ExpandImm.h"
@@ -127,6 +139,7 @@
   bool visitINSERT(MachineInstr &MI);
   bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
   bool visitINSvi64lane(MachineInstr &MI);
+  bool visitDUPgpr(MachineInstr &MI);
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override {
@@ -676,6 +689,99 @@
   return true;
 }
 
+bool AArch64MIPeepholeOpt::visitDUPgpr(MachineInstr &MI) {
+  // We are expecting the cases below.
+  //
+  // first case
+  // %1:fpr16 = UADDLVv8i8v %0:fpr64
+  // %2:fpr128 = INSERT_SUBREG %3:fpr128(tied-def 0), killed %1:fpr16, %subreg.hsub
+  // %4:gpr32 = COPY %2.ssub:fpr128
+  // %5:fpr128 = DUPv8i16gpr killed %4:gpr32
+  //
+  // second case
+  // %8:fpr64 = URSHRd killed %9:fpr64, 3
+  // %10:gpr64all = COPY %8:fpr64
+  // %11:gpr32 = COPY %10.sub_32:gpr64all
+  // %12:fpr64 = DUPv8i8gpr killed %11:gpr32
+  //
+  // We can remove the COPY and use DUPv8i8lane/DUPv8i16lane instead, because
+  // the fpr64/fpr128 register is shared with the v64/v128 vector register:
+  //
+  // first case
+  // %1:fpr16 = UADDLVv8i8v %0:fpr64
+  // %2:fpr128 = INSERT_SUBREG %3:fpr128(tied-def 0), killed %1:fpr16, %subreg.hsub
+  // %5:fpr128 = DUPv8i16lane killed %2:fpr128, 0
+  //
+  // second case
+  // %8:fpr64 = URSHRd killed %9:fpr64, 3
+  // %12:fpr64 = DUPv8i8lane killed %8:fpr64, 0
+  //
+  // TODO: Check for more DUP scalar types that can be replaced with a DUP
+  // element. It should be enough to check a COPY from an FPR into the DUP.
+
+  // Check for a COPY with ssub/sub_32 from an fpr128/fpr64 register.
+  MachineInstr *CopyMI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
+  if (CopyMI->getOpcode() != TargetOpcode::COPY)
+    return false;
+
+  if (MI.getOpcode() == AArch64::DUPv8i16gpr) {
+    // We are expecting the case below.
+    //
+    // %1:fpr16 = UADDLVv8i8v %0:fpr64
+    // %2:fpr128 = INSERT_SUBREG %3:fpr128(tied-def 0), killed %1:fpr16, %subreg.hsub
+    // %4:gpr32 = COPY %2.ssub:fpr128
+    // %5:fpr128 = DUPv8i16gpr killed %4:gpr32
+    if (CopyMI->getOperand(1).getSubReg() != AArch64::ssub)
+      return false;
+
+    const TargetRegisterClass *RC =
+        MRI->getRegClass(CopyMI->getOperand(1).getReg());
+    if (RC != &AArch64::FPR128RegClass)
+      return false;
+  }
+
+  if (MI.getOpcode() == AArch64::DUPv8i8gpr) {
+    // We are expecting the case below.
+    //
+    // %8:fpr64 = URSHRd killed %9:fpr64, 3
+    // %10:gpr64all = COPY %8:fpr64
+    // %11:gpr32 = COPY %10.sub_32:gpr64all
+    // %12:fpr64 = DUPv8i8gpr killed %11:gpr32
+    if (CopyMI->getOperand(1).getSubReg() != AArch64::sub_32)
+      return false;
+
+    const TargetRegisterClass *RC =
+        MRI->getRegClass(CopyMI->getOperand(1).getReg());
+    if (RC != &AArch64::FPR64RegClass)
+      CopyMI = MRI->getUniqueVRegDef(CopyMI->getOperand(1).getReg());
+
+    if (CopyMI->getOpcode() != TargetOpcode::COPY)
+      return false;
+
+    RC = MRI->getRegClass(CopyMI->getOperand(1).getReg());
+    if (RC != &AArch64::FPR64RegClass)
+      return false;
+  }
+
+  // Create DUPv8i8lane/DUPv8i16lane.
+  unsigned Opc = (MI.getOpcode() == AArch64::DUPv8i16gpr)
+                     ? AArch64::DUPv8i16lane
+                     : AArch64::DUPv8i8lane;
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = CopyMI->getOperand(1).getReg();
+  MachineInstr *DUPlaneMI =
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
+          .addUse(SrcReg, getRegState(MI.getOperand(1)))
+          .addImm(0);
+
+  LLVM_DEBUG(dbgs() << MI << "  replaced by:\n    " << *DUPlaneMI << "\n");
+  (void)DUPlaneMI;
+
+  MI.eraseFromParent();
+
+  return true;
+}
+
 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -754,6 +860,10 @@
       case AArch64::INSvi64lane:
        Changed = visitINSvi64lane(MI);
        break;
+      case AArch64::DUPv8i16gpr:
+      case AArch64::DUPv8i8gpr:
+        Changed = visitDUPgpr(MI);
+        break;
       }
     }
   }
Index: llvm/test/CodeGen/AArch64/replace-dupgpr-with-duplane.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/replace-dupgpr-with-duplane.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+define <8 x i8> @test1(<8 x i8> noundef %a) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    and w8, w8, #0xffff
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    urshr d0, d0, #3
+; CHECK-NEXT:    dup v0.8b, v0.b[0]
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
+  %0 = and i32 %vaddlv.i, 65535
+  %conv = zext i32 %0 to i64
+  %vrshr_n = tail call i64 @llvm.aarch64.neon.urshl.i64(i64 %conv, i64 -3)
+  %conv1 = trunc i64 %vrshr_n to i8
+  %vecinit.i = insertelement <8 x i8> undef, i8 %conv1, i64 0
+  %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> poison, <8 x i32> zeroinitializer
+  ret <8 x i8> %vecinit7.i
+}
+
+declare i64 @llvm.aarch64.neon.urshl.i64(i64, i64)
+
+define <8 x i8> @test2(<8 x i8> noundef %a) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv h1, v0.8b
+; CHECK-NEXT:    dup v0.8h, v1.h[0]
+; CHECK-NEXT:    rshrn v0.8b, v0.8h, #3
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
+  %0 = trunc i32 %vaddlv.i to i16
+  %vecinit.i = insertelement <8 x i16> undef, i16 %0, i64 0
+  %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> poison, <8 x i32> zeroinitializer
+  %vrshrn_n2 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %vecinit7.i, i32 3)
+  ret <8 x i8> %vrshrn_n2
+}
+
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>)
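A possible generalization of the TODO in visitDUPgpr, sketched below: rather than hard-coding the two DUPv8i16gpr/DUPv8i8gpr cases, a small mapping from each DUP scalar (DUP from GPR) opcode to its DUP element (DUP from lane) counterpart would let one code path handle every element size. This is only an illustrative sketch, not part of the patch above: the helper name getDUPLaneOpcode is hypothetical, it assumes the headers already included by AArch64MIPeepholeOpt.cpp, and the per-case sub-register (ssub vs. sub_32) and source register class checks shown in visitDUPgpr would still be needed for each opcode.

#include <optional>

// Hypothetical helper: map a DUP-from-GPR opcode to its DUP-from-lane
// counterpart. Returns std::nullopt for opcodes this peephole does not
// know how to rewrite. The opcode names are existing AArch64 instruction
// definitions; only the helper itself is new.
static std::optional<unsigned> getDUPLaneOpcode(unsigned GPROpc) {
  switch (GPROpc) {
  case AArch64::DUPv8i8gpr:
    return AArch64::DUPv8i8lane;
  case AArch64::DUPv16i8gpr:
    return AArch64::DUPv16i8lane;
  case AArch64::DUPv4i16gpr:
    return AArch64::DUPv4i16lane;
  case AArch64::DUPv8i16gpr:
    return AArch64::DUPv8i16lane;
  case AArch64::DUPv2i32gpr:
    return AArch64::DUPv2i32lane;
  case AArch64::DUPv4i32gpr:
    return AArch64::DUPv4i32lane;
  case AArch64::DUPv2i64gpr:
    return AArch64::DUPv2i64lane;
  default:
    return std::nullopt;
  }
}

With such a helper, the switch in runOnMachineFunction could list all of the DUP scalar opcodes above, and visitDUPgpr could pick the replacement opcode from the mapping instead of the two-way conditional, provided the COPY-matching logic is extended to cover the additional element sizes.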