This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/
-
CodeGen/GlobalISel/
-
GlobalISel/
-
LegalizerHelper.cpp
-
Target/AArch64/GISel/
-
AArch64/
-
GISel/
-
AArch64LegalizerInfo.cpp
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
-
GlobalISel/
-
legalize-ctpop.mir
-
popcount.ll

Differential D106494

[AArch64][GlobalISel] Legalize ctpop s128
ClosedPublic

Authored by jroelofs on Jul 21 2021, 1:27 PM.

Download Raw Diff

Details

Reviewers

aemerson
paquette

Commits

rG98f38c151b5a: [AArch64][GlobalISel] Legalize ctpop s128
rG97e95fea53fc: [AArch64][GlobalISel] Legalize ctpop s128

Summary

~Marking as "WIP" because the code generated for the zext i32 -> i128 is less than ideal, which reflects poorly on ctpop 256.~

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

jroelofs created this revision.Jul 21 2021, 1:27 PM

Herald added subscribers: danielkiss, hiraditya, kristof.beyls, rovka. · View Herald TranscriptJul 21 2021, 1:27 PM

jroelofs requested review of this revision.Jul 21 2021, 1:27 PM

Herald added a project: Restricted Project. · View Herald TranscriptJul 21 2021, 1:27 PM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B115412: Diff 360582.Jul 21 2021, 4:31 PM

Do you have any idea of how to improve the zext behaviour?

In D106494#2898756, @paquette wrote:

Do you have any idea of how to improve the zext behaviour?

I was thinking about making narrowScalar do: zext(add(trunc(ctpop(hi)), trunc(ctpop(lo)))) with the hope that some combine folds the inner trunc(zext(ctpop(x))) => ctpop(x), but I haven't tried this yet.

In D106494#2898851, @jroelofs wrote:

In D106494#2898756, @paquette wrote:

Do you have any idea of how to improve the zext behaviour?

I was thinking about making narrowScalar do: zext(add(trunc(ctpop(hi)), trunc(ctpop(lo)))) with the hope that some combine folds the inner trunc(zext(ctpop(x))) => ctpop(x), but I haven't tried this yet.

For the s32 -> s64 case, that should be folded to a G_ZEXT by D106768

In D106494#2903324, @aemerson wrote:

In D106494#2898851, @jroelofs wrote:

In D106494#2898756, @paquette wrote:

Do you have any idea of how to improve the zext behaviour?

I was thinking about making narrowScalar do: zext(add(trunc(ctpop(hi)), trunc(ctpop(lo)))) with the hope that some combine folds the inner trunc(zext(ctpop(x))) => ctpop(x), but I haven't tried this yet.

For the s32 -> s64 case, that should be folded to a G_ZEXT by D106768

Ah but these are going to be assigned to the FPR banks. We might be able to recognize the extend to s128 pattern and select the optimal code during selection.

Rebased.

Also added zext(add(trunc(ctpop(lo)), trunc(ctpop(hi)))) narrowing of the add to improve things for ctpop 256.

LGTM. I don't think we need to have perfect codegen first time if we're adding support from scratch.

This revision is now accepted and ready to land.Jul 26 2021, 3:08 PM

This revision was landed with ongoing or failed builds.Jul 26 2021, 4:34 PM

Closed by commit rG97e95fea53fc: [AArch64][GlobalISel] Legalize ctpop s128 (authored by jroelofs). · Explain Why

This revision was automatically updated to reflect the committed changes.

jroelofs added a commit: rG97e95fea53fc: [AArch64][GlobalISel] Legalize ctpop s128.

Harbormaster completed remote builds in B116283: Diff 361810.Jul 26 2021, 4:45 PM

jroelofs added a reverting change: rGf2e8e46d7863: Revert "[AArch64][GlobalISel] Legalize ctpop s128".Jul 26 2021, 5:07 PM

jroelofs added a commit: rG98f38c151b5a: [AArch64][GlobalISel] Legalize ctpop s128.Aug 5 2021, 11:55 AM

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

GlobalISel/

LegalizerHelper.cpp

10 lines

Target/

AArch64/

GISel/

AArch64LegalizerInfo.cpp

7 lines

test/

CodeGen/

AArch64/

GlobalISel/

legalize-ctpop.mir

24 lines

popcount.ll

58 lines

Diff 361850

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,616 Lines • ▼ Show 20 Lines	LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());		LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
unsigned NarrowSize = NarrowTy.getSizeInBits();		unsigned NarrowSize = NarrowTy.getSizeInBits();

if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {		if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));		auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));

auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));		auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));		auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));

		LLT CountTy = LLT::scalar(Log2_64_Ceil(SrcTy.getSizeInBits()));
		if (CountTy.getSizeInBits() < DstTy.getSizeInBits()) {
		LoCTPOP = MIRBuilder.buildTrunc(CountTy, LoCTPOP);
		HiCTPOP = MIRBuilder.buildTrunc(CountTy, HiCTPOP);
		auto Add = MIRBuilder.buildAdd(CountTy, HiCTPOP, LoCTPOP);
		MIRBuilder.buildZExt(DstReg, Add);
		} else
MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);		MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);

MI.eraseFromParent();		MI.eraseFromParent();
return Legalized;		return Legalized;
}		}

return UnableToLegalize;		return UnableToLegalize;
}		}

▲ Show 20 Lines • Show All 1,662 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp

Show First 20 Lines • Show All 758 Lines • ▼ Show 20 Lines	getActionDefinitionsBuilder(G_ROTR)
return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;		return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
})		})
.lower();		.lower();
getActionDefinitionsBuilder(G_ROTL).lower();		getActionDefinitionsBuilder(G_ROTL).lower();

getActionDefinitionsBuilder({G_SBFX, G_UBFX})		getActionDefinitionsBuilder({G_SBFX, G_UBFX})
.customFor({{s32, s32}, {s64, s64}});		.customFor({{s32, s32}, {s64, s64}});

// TODO: Custom legalization for s128
// TODO: Use generic lowering when custom lowering is not possible.		// TODO: Use generic lowering when custom lowering is not possible.
auto always = [=](const LegalityQuery &Q) { return true; };		auto always = [=](const LegalityQuery &Q) { return true; };
getActionDefinitionsBuilder(G_CTPOP)		getActionDefinitionsBuilder(G_CTPOP)
.legalFor({{v8s8, v8s8}, {v16s8, v16s8}})		.legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
.clampScalar(0, s32, s128)		.clampScalar(0, s32, s128)
.widenScalarToNextPow2(0)		.widenScalarToNextPow2(0)
.minScalarEltSameAsIf(always, 1, 0)		.minScalarEltSameAsIf(always, 1, 0)
.maxScalarEltSameAsIf(always, 1, 0)		.maxScalarEltSameAsIf(always, 1, 0)
.customFor({{s32, s32},		.customFor({{s32, s32},
{s64, s64},		{s64, s64},
		{s128, s128},
{v2s64, v2s64},		{v2s64, v2s64},
{v2s32, v2s32},		{v2s32, v2s32},
{v4s32, v4s32},		{v4s32, v4s32},
{v4s16, v4s16},		{v4s16, v4s16},
{v8s16, v8s16}});		{v8s16, v8s16}});

getLegacyLegalizerInfo().computeTables();		getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());		verify(*ST.getInstrInfo());
▲ Show 20 Lines • Show All 360 Lines • ▼ Show 20 Lines	assert(Ty == MRI.getType(Dst) &&
"Expected src and dst to have the same type!");		"Expected src and dst to have the same type!");
unsigned Size = Ty.getSizeInBits();		unsigned Size = Ty.getSizeInBits();

// Pre-conditioning: widen Val up to the nearest vector type.		// Pre-conditioning: widen Val up to the nearest vector type.
// s32,s64,v4s16,v2s32 -> v8i8		// s32,s64,v4s16,v2s32 -> v8i8
// v8s16,v4s32,v2s64 -> v16i8		// v8s16,v4s32,v2s64 -> v16i8
LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);		LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
if (Ty.isScalar()) {		if (Ty.isScalar()) {
// TODO: Handle s128.		assert((Size == 32 \|\| Size == 64 \|\| Size == 128) && "Expected only 32, 64, or 128 bit scalars!");
assert((Size == 32 \|\| Size == 64) && "Expected only 32 or 64 bit scalars!");
if (Size == 32) {		if (Size == 32) {
Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);		Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
}		}
}		}
Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);		Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

// Count bits in each byte-sized lane.		// Count bits in each byte-sized lane.
auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);		auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
Show All 29 Lines	bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
MachineInstrBuilder UADD;		MachineInstrBuilder UADD;
for (LLT HTy : HAddTys) {		for (LLT HTy : HAddTys) {
UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /HasSideEffects =/false)		UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /HasSideEffects =/false)
.addUse(HSum);		.addUse(HSum);
HSum = UADD.getReg(0);		HSum = UADD.getReg(0);
}		}

// Post-conditioning.		// Post-conditioning.
if (Ty.isScalar() && Size == 64)		if (Ty.isScalar() && (Size == 64 \|\| Size == 128))
MIRBuilder.buildZExt(Dst, UADD);		MIRBuilder.buildZExt(Dst, UADD);
else		else
UADD->getOperand(0).setReg(Dst);		UADD->getOperand(0).setReg(Dst);
MI.eraseFromParent();		MI.eraseFromParent();
return true;		return true;
}		}

bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(		bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
▲ Show 20 Lines • Show All 106 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir

Show First 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	bb.0:
; CHECK: RET_ReallyLR implicit $x0		; CHECK: RET_ReallyLR implicit $x0
%copy:_(s64) = COPY $x0		%copy:_(s64) = COPY $x0
%ctpop:_(s64) = G_CTPOP %copy(s64)		%ctpop:_(s64) = G_CTPOP %copy(s64)
$x0 = COPY %ctpop(s64)		$x0 = COPY %ctpop(s64)
RET_ReallyLR implicit $x0		RET_ReallyLR implicit $x0

...		...
---		---
		name: s128_lower
		tracksRegLiveness: true
		body: \|
		bb.0:
		liveins: $q0
		; CHECK-LABEL: name: s128_lower
		; CHECK: liveins: $q0
		; CHECK: %copy:_(s128) = COPY $q0
		; CHECK: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST %copy(s128)
		; CHECK: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
		; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<16 x s8>)
		; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
		; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[INT]](s32), [[C]](s32)
		; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
		; CHECK: %ctpop:_(s128) = G_MERGE_VALUES [[MV]](s64), [[C1]](s64)
		; CHECK: $q0 = COPY %ctpop(s128)
		; CHECK: RET_ReallyLR implicit $q0
		%copy:_(s128) = COPY $q0
		%ctpop:_(s128) = G_CTPOP %copy(s128)
		$q0 = COPY %ctpop(s128)
		RET_ReallyLR implicit $q0

		...
		---
name: widen_s16		name: widen_s16
tracksRegLiveness: true		tracksRegLiveness: true
body: \|		body: \|
bb.0:		bb.0:
liveins: $w0		liveins: $w0

; CHECK-LABEL: name: widen_s16		; CHECK-LABEL: name: widen_s16
; CHECK: liveins: $w0		; CHECK: liveins: $w0
▲ Show 20 Lines • Show All 209 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/popcount.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -O0 -mtriple=aarch64-unknown-unknown \| FileCheck %s			; RUN: llc < %s -O0 -mtriple=aarch64-unknown-unknown \| FileCheck %s

	; Function Attrs: nobuiltin nounwind readonly			; Function Attrs: nobuiltin nounwind readonly
	define i8 @popcount128(i128* nocapture nonnull readonly %0) {			define i8 @popcount128(i128* nocapture nonnull readonly %0) {
	; CHECK-LABEL: popcount128:			; CHECK-LABEL: popcount128:
	; CHECK: // %bb.0: // %Entry			; CHECK: // %bb.0: // %Entry
	; CHECK-NEXT: ldr x8, [x0, #8]			; CHECK-NEXT: ldr q0, [x0]
	; CHECK-NEXT: ldr d1, [x0]
	; CHECK-NEXT: // implicit-def: $q0
	; CHECK-NEXT: mov v0.16b, v1.16b
	; CHECK-NEXT: mov v0.d[1], x8
	; CHECK-NEXT: cnt v0.16b, v0.16b			; CHECK-NEXT: cnt v0.16b, v0.16b
	; CHECK-NEXT: uaddlv h1, v0.16b			; CHECK-NEXT: uaddlv h1, v0.16b
	; CHECK-NEXT: // implicit-def: $q0			; CHECK-NEXT: // implicit-def: $q0
	; CHECK-NEXT: mov v0.16b, v1.16b			; CHECK-NEXT: mov v0.16b, v1.16b
				; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
	; CHECK-NEXT: fmov w0, s0			; CHECK-NEXT: fmov w0, s0
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	Entry:			Entry:
	%1 = load i128, i128* %0, align 16			%1 = load i128, i128* %0, align 16
	%2 = tail call i128 @llvm.ctpop.i128(i128 %1)			%2 = tail call i128 @llvm.ctpop.i128(i128 %1)
	%3 = trunc i128 %2 to i8			%3 = trunc i128 %2 to i8
	ret i8 %3			ret i8 %3
	}			}

	; Function Attrs: nounwind readnone speculatable willreturn			; Function Attrs: nounwind readnone speculatable willreturn
	declare i128 @llvm.ctpop.i128(i128)			declare i128 @llvm.ctpop.i128(i128)

	; Function Attrs: nobuiltin nounwind readonly			; Function Attrs: nobuiltin nounwind readonly
	define i16 @popcount256(i256* nocapture nonnull readonly %0) {			define i16 @popcount256(i256* nocapture nonnull readonly %0) {
	; CHECK-LABEL: popcount256:			; CHECK-LABEL: popcount256:
	; CHECK: // %bb.0: // %Entry			; CHECK: // %bb.0: // %Entry
	; CHECK-NEXT: ldr x8, [x0, #8]			; CHECK-NEXT: ldr x11, [x0]
	; CHECK-NEXT: ldr x9, [x0, #24]			; CHECK-NEXT: ldr x10, [x0, #8]
	; CHECK-NEXT: ldr d1, [x0, #16]			; CHECK-NEXT: ldr x9, [x0, #16]
	; CHECK-NEXT: // implicit-def: $q0			; CHECK-NEXT: ldr x8, [x0, #24]
	; CHECK-NEXT: mov v0.16b, v1.16b			; CHECK-NEXT: // implicit-def: $q0
	; CHECK-NEXT: mov v0.d[1], x9			; CHECK-NEXT: mov v0.d[0], x11
				; CHECK-NEXT: mov v0.d[1], x10
				; CHECK-NEXT: // implicit-def: $q1
				; CHECK-NEXT: mov v1.d[0], x9
				; CHECK-NEXT: mov v1.d[1], x8
	; CHECK-NEXT: cnt v0.16b, v0.16b			; CHECK-NEXT: cnt v0.16b, v0.16b
	; CHECK-NEXT: uaddlv h1, v0.16b			; CHECK-NEXT: uaddlv h2, v0.16b
	; CHECK-NEXT: // implicit-def: $q0			; CHECK-NEXT: // implicit-def: $q0
	; CHECK-NEXT: mov v0.16b, v1.16b			; CHECK-NEXT: mov v0.16b, v2.16b
				; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
				; CHECK-NEXT: cnt v1.16b, v1.16b
				; CHECK-NEXT: uaddlv h2, v1.16b
				; CHECK-NEXT: // implicit-def: $q1
				; CHECK-NEXT: mov v1.16b, v2.16b
				; CHECK-NEXT: // kill: def $s1 killed $s1 killed $q1
				; CHECK-NEXT: fmov w8, s1
	; CHECK-NEXT: fmov w9, s0			; CHECK-NEXT: fmov w9, s0
	; CHECK-NEXT: ldr d1, [x0]			; CHECK-NEXT: add w8, w8, w9
	; CHECK-NEXT: // implicit-def: $q0			; CHECK-NEXT: // implicit-def: $w9
	; CHECK-NEXT: mov v0.16b, v1.16b			; CHECK-NEXT: // kill: def $x8 killed $w8
	; CHECK-NEXT: mov v0.d[1], x8			; CHECK-NEXT: // kill: def $x9 killed $w9
	; CHECK-NEXT: cnt v0.16b, v0.16b			; CHECK-NEXT: bfi x8, x9, #32, #32
	; CHECK-NEXT: uaddlv h1, v0.16b			; CHECK-NEXT: and x8, x8, #0xff
	; CHECK-NEXT: // implicit-def: $q0			; CHECK-NEXT: mov w0, w8
	; CHECK-NEXT: mov v0.16b, v1.16b
	; CHECK-NEXT: fmov w8, s0
	; CHECK-NEXT: add w0, w8, w9
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	Entry:			Entry:
	%1 = load i256, i256* %0, align 16			%1 = load i256, i256* %0, align 16
	%2 = tail call i256 @llvm.ctpop.i256(i256 %1)			%2 = tail call i256 @llvm.ctpop.i256(i256 %1)
	%3 = trunc i256 %2 to i16			%3 = trunc i256 %2 to i16
	ret i16 %3			ret i16 %3
	}			}

	; Function Attrs: nounwind readnone speculatable willreturn			; Function Attrs: nounwind readnone speculatable willreturn
	declare i256 @llvm.ctpop.i256(i256)			declare i256 @llvm.ctpop.i256(i256)

	define <1 x i128> @popcount1x128(<1 x i128> %0) {			define <1 x i128> @popcount1x128(<1 x i128> %0) {
	; CHECK-LABEL: popcount1x128:			; CHECK-LABEL: popcount1x128:
	; CHECK: // %bb.0: // %Entry			; CHECK: // %bb.0: // %Entry
	; CHECK-NEXT: // implicit-def: $q0			; CHECK-NEXT: // implicit-def: $q0
	; CHECK-NEXT: fmov d0, x0			; CHECK-NEXT: mov v0.d[0], x0
	; CHECK-NEXT: mov v0.d[1], x1			; CHECK-NEXT: mov v0.d[1], x1
	; CHECK-NEXT: cnt v0.16b, v0.16b			; CHECK-NEXT: cnt v0.16b, v0.16b
	; CHECK-NEXT: uaddlv h1, v0.16b			; CHECK-NEXT: uaddlv h1, v0.16b
	; CHECK-NEXT: // implicit-def: $q0			; CHECK-NEXT: // implicit-def: $q0
	; CHECK-NEXT: mov v0.16b, v1.16b			; CHECK-NEXT: mov v0.16b, v1.16b
				; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
	; CHECK-NEXT: fmov w0, s0			; CHECK-NEXT: fmov w0, s0
				; CHECK-NEXT: mov w8, wzr
	; CHECK-NEXT: // kill: def $x0 killed $w0			; CHECK-NEXT: // kill: def $x0 killed $w0
	; CHECK-NEXT: movi v0.2d, #0000000000000000			; CHECK-NEXT: // kill: def $x8 killed $w8
	; CHECK-NEXT: mov x1, v0.d[1]			; CHECK-NEXT: bfi x0, x8, #32, #32
				; CHECK-NEXT: mov x1, xzr
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	Entry:			Entry:
	%1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)			%1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)
	ret <1 x i128> %1			ret <1 x i128> %1
	}			}

	declare <1 x i128> @llvm.ctpop.v1.i128(<1 x i128>)			declare <1 x i128> @llvm.ctpop.v1.i128(<1 x i128>)