This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AArch64/
-
Target/
-
AArch64/
-
AArch64ISelLowering.cpp
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
-
aarch64-neon-vector-insert-uaddlv.ll
-
neon-addlv.ll

Differential D158613

[AArch64] Mark known zero for high 16-bits of uaddlv intrinsic output with v8i8
ClosedPublic

Authored by jaykang10 on Aug 23 2023, 6:35 AM.

Download Raw Diff

Details

Reviewers

dmgreen
efriedma
t.p.northover

Commits

rG82e851a407c5: [AArch64] Change bound for known zero bits of uaddlv intrinsic

Summary

If the llvm.aarch64.neon.uaddlv intrinsic has a v8i8 input, then it returns a 16-bit value.
clang generates llvm.aarch64.neon.uaddlv.i32.v8i8 followed by a trunc to i16 for the vaddlv_u8 NEON intrinsic. This causes an additional "and 0xffff" instruction, as shown in the example below.

foo:                                    // @foo
        uaddlv  h0, v0.8b
        fmov    w8, s0
        and     w0, w8, #0xffff
        ret

If we mark the high 16 bits of the uaddlv intrinsic output as known zero for a v8i8 input, we can avoid the additional "and 0xffff".

foo:                                    // @foo
        uaddlv  h0, v0.8b
        fmov    w0, s0
        ret

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

jaykang10 created this revision.Aug 23 2023, 6:35 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 23 2023, 6:35 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

jaykang10 requested review of this revision.Aug 23 2023, 6:35 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 23 2023, 6:35 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B254331: Diff 552691.Aug 23 2023, 7:52 AM

LGTM, thanks.

This revision is now accepted and ready to land.Aug 24 2023, 2:00 AM

Can we also handle v16i8?

Is it worth trying to use a more precise bound, instead of just the approximate power of two? The result for a v8i8 input needs at most 11 bits (the bit width of the maximum possible sum, 8*255 = 2040). If we're using a more precise bound like this, we could also handle other forms of uaddlv.

Thanks for comments.

Can we also handle v16i8?

Yep, let me add the type.

Is it worth trying to use a more precise bound, instead of just the approximate power of two? The result for a v8i8 input needs at most 11 bits (the bit width of the maximum possible sum, 8*255 = 2040). If we're using a more precise bound like this, we could also handle other forms of uaddlv.

Yep, I think it would be good to make more bits zero. Let me update the bound.

Following @efriedma's comment, changed the bound to 11 and supported v16i8 type.

Harbormaster completed remote builds in B255449: Diff 554234.Aug 29 2023, 3:05 AM

@efriedma I have updated the patch. Please check it.

Fixed wrong bound for v16i8.

Harbormaster completed remote builds in B255461: Diff 554251.Aug 29 2023, 4:33 AM

It looks like you uploaded a diff on top of the first patch instead of the whole patch? Please make sure you commit the right thing.

LGTM with that fixed

Sorry, I accidentally pushed the first patch last week without referencing this review...
Let me push this patch on top of the first one.
Thanks for comments.

This revision was landed with ongoing or failed builds.Aug 30 2023, 12:22 AM

Closed by commit rG82e851a407c5: [AArch64] Change bound for known zero bits of uaddlv intrinsic (authored by jaykang10). · Explain Why

This revision was automatically updated to reflect the committed changes.

jaykang10 added a commit: rG82e851a407c5: [AArch64] Change bound for known zero bits of uaddlv intrinsic.

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

7 lines

test/

CodeGen/

AArch64/

aarch64-neon-vector-insert-uaddlv.ll

2 lines

neon-addlv.ll

18 lines

Diff 554601

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 2,159 Lines • ▼ Show 20 Lines	void AArch64TargetLowering::computeKnownBitsForTargetNode(
case ISD::INTRINSIC_VOID: {		case ISD::INTRINSIC_VOID: {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();		unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IntNo) {		switch (IntNo) {
default:		default:
break;		break;
case Intrinsic::aarch64_neon_uaddlv: {		case Intrinsic::aarch64_neon_uaddlv: {
MVT VT = Op.getOperand(1).getValueType().getSimpleVT();		MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
unsigned BitWidth = Known.getBitWidth();		unsigned BitWidth = Known.getBitWidth();
if (VT == MVT::v8i8) {		if (VT == MVT::v8i8 || VT == MVT::v16i8) {
assert(BitWidth >= 16 && "Unexpected width!");		unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);		assert(BitWidth >= Bound && "Unexpected width!");
		APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound);
Known.Zero |= Mask;		Known.Zero |= Mask;
}		}
break;		break;
}		}
case Intrinsic::aarch64_neon_umaxv:		case Intrinsic::aarch64_neon_umaxv:
case Intrinsic::aarch64_neon_uminv: {		case Intrinsic::aarch64_neon_uminv: {
// Figure out the datatype of the vector operand. The UMINV instruction		// Figure out the datatype of the vector operand. The UMINV instruction
// will zero extend the result, so we can mark as known zero all the		// will zero extend the result, so we can mark as known zero all the
▲ Show 20 Lines • Show All 23,919 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll

	Show First 20 Lines • Show All 294 Lines • ▼ Show 20 Lines

	define void @insert_vec_v8i8_uaddlv_from_v8i8(ptr %0) {			define void @insert_vec_v8i8_uaddlv_from_v8i8(ptr %0) {
	; CHECK-LABEL: insert_vec_v8i8_uaddlv_from_v8i8:			; CHECK-LABEL: insert_vec_v8i8_uaddlv_from_v8i8:
	; CHECK: ; %bb.0: ; %entry			; CHECK: ; %bb.0: ; %entry
	; CHECK-NEXT: movi.2d v0, #0000000000000000			; CHECK-NEXT: movi.2d v0, #0000000000000000
	; CHECK-NEXT: stp xzr, xzr, [x0, #16]			; CHECK-NEXT: stp xzr, xzr, [x0, #16]
	; CHECK-NEXT: uaddlv.8b h1, v0			; CHECK-NEXT: uaddlv.8b h1, v0
	; CHECK-NEXT: mov.h v0[0], v1[0]			; CHECK-NEXT: mov.h v0[0], v1[0]
	; CHECK-NEXT: bic.4h v0, #255, lsl #8			; CHECK-NEXT: bic.4h v0, #7, lsl #8
	; CHECK-NEXT: ushll.4s v0, v0, #0			; CHECK-NEXT: ushll.4s v0, v0, #0
	; CHECK-NEXT: ucvtf.4s v0, v0			; CHECK-NEXT: ucvtf.4s v0, v0
	; CHECK-NEXT: str q0, [x0]			; CHECK-NEXT: str q0, [x0]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret

	entry:			entry:
	%vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> zeroinitializer)			%vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> zeroinitializer)
	%1 = trunc i32 %vaddlv to i8			%1 = trunc i32 %vaddlv to i8
	▲ Show 20 Lines • Show All 177 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/neon-addlv.ll

Show First 20 Lines • Show All 147 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A		%tmp1 = load <4 x i16>, ptr %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)		%tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
%tmp5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %tmp3)		%tmp5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %tmp3)
ret i32 %tmp5		ret i32 %tmp5
}		}

declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>) nounwind readnone		declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>) nounwind readnone

define i32 @uaddlv_known_bits(<8 x i8> %a) {		define i32 @uaddlv_known_bits_v8i8(<8 x i8> %a) {
; CHECK-LABEL: uaddlv_known_bits:		; CHECK-LABEL: uaddlv_known_bits_v8i8:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: uaddlv h0, v0.8b		; CHECK-NEXT: uaddlv h0, v0.8b
; CHECK-NEXT: fmov w0, s0		; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%tmp1 = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)		%tmp1 = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
%tmp2 = and i32 %tmp1, 65535		%tmp2 = and i32 %tmp1, 65535
ret i32 %tmp2		ret i32 %tmp2
}		}

		declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone

		define i32 @uaddlv_known_bits_v16i8(<16 x i8> %a) {
		; CHECK-LABEL: uaddlv_known_bits_v16i8:
		; CHECK: // %bb.0: // %entry
		; CHECK-NEXT: uaddlv h0, v0.16b
		; CHECK-NEXT: fmov w0, s0
		; CHECK-NEXT: ret
		entry:
		%vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a)
		%0 = and i32 %vaddlv.i, 65535
		ret i32 %0
		}