It looks like gcc folds the vector shift intrinsic with a zero shift amount, as in the example below.
```c
#include <arm_neon.h>

inline void foo(int64x2_t a, int64x2_t b, int64_t *dst, int df) {
  int64x2_t df_s64 = vdupq_n_s64(df);
  a = vpaddq_s64(a, b);
  a = vshlq_s64(a, df_s64);
  vst1q_s64(dst, a);
}

void bar(int64x2_t a, int64x2_t b, int64_t *dst) {
  foo(a, b, dst, 0);
}
```

gcc output:
```
bar:
        addp    v0.2d, v0.2d, v1.2d
        str     q0, [x0]
        ret
```

llvm output:
```
bar:
        addp    v0.2d, v0.2d, v1.2d
        shl     v0.2d, v0.2d, #0
        str     q0, [x0]
        ret
```
It looks like llvm's AArch64 target lowers the intrinsic to a target-specific custom node in SelectionDAG and is missing a fold of that custom node when the shift amount is zero.
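For illustration, here is a minimal sketch of what such a fold could look like as a target DAG combine. It assumes the splatted shift amount has already been lowered to an immediate operand of the target's vector shift-left node; the helper name and the exact node checked are illustrative, not the patch's actual code.

```c++
// Hypothetical helper, e.g. somewhere in AArch64ISelLowering.cpp.
// Folds a target vector shift-left node whose immediate shift amount is 0.
static SDValue foldZeroVectorShift(SDNode *N) {
  // Operand 1 is assumed to hold the immediate shift amount.
  if (auto *Amt = dyn_cast<ConstantSDNode>(N->getOperand(1)))
    if (Amt->getZExtValue() == 0)
      return N->getOperand(0); // shl #0 is the identity; forward the input.
  return SDValue(); // No fold; keep the node as-is.
}
```

Something equivalent registered in the target's DAG combiner would drop the `shl v0.2d, v0.2d, #0` above before instruction selection.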
With this patch, the llvm output is as below:
```
bar:
        addp    v0.2d, v0.2d, v1.2d
        str     q0, [x0]
        ret
```
Apparently this one is not correct, as sqshlu still saturates the input even with a zero shift. The others look OK.
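For reference, a small standalone example (using the ACLE intrinsic `vqshluq_n_s64`, which maps to sqshlu) showing that a zero shift is not an identity for this instruction, since negative lanes saturate to 0 in the unsigned result:

```c++
#include <arm_neon.h>
#include <inttypes.h>
#include <stdio.h>

int main(void) {
  int64x2_t x = vdupq_n_s64(-1);
  // sqshlu with a zero shift still saturates the signed input to the
  // unsigned range, so the negative lanes become 0 rather than staying -1.
  uint64x2_t y = vqshluq_n_s64(x, 0);
  printf("%" PRIu64 " %" PRIu64 "\n",
         vgetq_lane_u64(y, 0), vgetq_lane_u64(y, 1));
  return 0;
}
```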