This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Replace DUP scalar by DUP element
AbandonedPublic

Authored by jaykang10 on Apr 12 2023, 8:12 AM.

Details

Summary

gcc generates fewer instructions than llvm for the intrinsic examples below.

#include <arm_neon.h>

uint8x8_t test1(uint8x8_t a) {
    return vdup_n_u8(vrshrd_n_u64(vaddlv_u8(a), 3));
}

uint8x8_t test2(uint8x8_t a) {
    return vrshrn_n_u16(vdupq_n_u16(vaddlv_u8(a)), 3); 
}

gcc output
test1:
	uaddlv	h0, v0.8b
	umov	w0, v0.h[0]
	fmov	d0, x0
	urshr	d0, d0, 3
	dup	v0.8b, v0.b[0]
	ret

test2:
	uaddlv	h0, v0.8b
	dup	v0.8h, v0.h[0]
	rshrn	v0.8b, v0.8h, 3
	ret

llvm output
test1:                                  // @test1
	uaddlv	h0, v0.8b
	fmov	w8, s0
	and	w8, w8, #0xffff
	fmov	d0, x8
	urshr	d0, d0, #3
	fmov	x8, d0
	dup	v0.8b, w8
	ret

test2:                                  // @test2
	uaddlv	h0, v0.8b
	fmov	w8, s0
	dup	v0.8h, w8
	rshrn	v0.8b, v0.8h, #3
	ret

We can see additional fmov instructions in the llvm output.
The uaddlv instruction writes its result to an FPR, while this form of dup reads its source from a GPR. A COPY instruction is therefore needed to convert between the FPR and GPR register classes, and it is expanded to fmov.
There is also a dup variant whose source is a SIMD register lane, called DUP (element). Using it lets us remove the COPY instruction, because the FPRs are aliased with the SIMD registers.
With this patch, llvm generates the output below.

test1:                                  // @test1
	uaddlv	h0, v0.8b
	fmov	w8, s0
	and	w8, w8, #0xffff
	fmov	d0, x8
	urshr	d0, d0, #3
	dup	v0.8b, v0.b[0]
	ret

test2:                                  // @test2
	uaddlv	h1, v0.8b
	dup	v0.8h, v1.h[0]
	rshrn	v0.8b, v0.8h, #3
	ret

Diff Detail

Event Timeline

jaykang10 created this revision. · Apr 12 2023, 8:12 AM
Herald added a project: Restricted Project. · View Herald Transcript · Apr 12 2023, 8:12 AM
jaykang10 requested review of this revision. · Apr 12 2023, 8:12 AM

For the i64 neon intrinsics it would be possible to change the representation in DAG combine so that they passed and used a v1i64 node, which might help that case simplify in SDAG. Maybe something similar could be done for UADDLV too.

This way has other advantages with it being shared between SDAG and GlobalISel. And might come up in other cases, like across basic-block boundaries. Can we extend it to all the DUP sizes?

jaykang10 abandoned this revision. · Apr 13 2023, 9:04 AM

> For the i64 neon intrinsics it would be possible to change the representation in DAG combine so that they passed and used a v1i64 node, which might help that case simplify in SDAG. Maybe something similar could be done for UADDLV too.
>
> This way has other advantages with it being shared between SDAG and GlobalISel. And might come up in other cases, like across basic-block boundaries. Can we extend it to all the DUP sizes?

I agree with you.
It would be better to solve this issue at the SelectionDAG level.
Let me close this patch.

OK, this might have uses in general. Like I said, a MIPeephole can have advantages. We can reinstate it if we find a good use, or if we get stuck doing it the other way.

> OK, this might have uses in general. Like I said MIPeephole can have advantages. We can reinstate it if we find a good use, or get stuck doing it the other way.

Yep, I think casting the type to v1i64 would be the better way to fix this issue.