This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] custom lowering for i128 popcount
ClosedPublic

Authored by shawnl on Jun 7 2020, 8:23 AM.

Details

Summary

Halves the number of CNT instructions generated for an i128 popcount.
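
For context, the kind of input this lowering targets looks roughly like the sketch below (illustrative only; the function name and exact shape are not necessarily those of the test in llvm/test/CodeGen/AArch64/popcount.ll). Without custom lowering, the i128 CTPOP is expanded into two i64 popcounts, each lowering to its own CNT; the idea here, presumably, is to treat the value as a single 128-bit vector so one CNT plus a horizontal add suffices.

declare i128 @llvm.ctpop.i128(i128)

; Count the set bits of an i128 loaded from memory.
define i128 @popcount128(i128* nocapture nonnull readonly %p) {
  %v = load i128, i128* %p, align 16
  %c = call i128 @llvm.ctpop.i128(i128 %v)
  ret i128 %c
}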

Diff Detail

Event Timeline

shawnl created this revision.Jun 7 2020, 8:23 AM

(If you're not using Arcanist, please upload patches with full context, -U100000.)

llvm/test/CodeGen/AArch64/popcount.ll
10

Why are we generating two loads here? Something related to the BITCAST legalization?

shawnl added inline comments.Jun 7 2020, 1:09 PM
llvm/test/CodeGen/AArch64/popcount.ll
10

Yes, it should be:

ldr q0, [x0]

shawnl marked an inline comment as done.Jun 7 2020, 2:25 PM
shawnl added inline comments.
llvm/test/CodeGen/AArch64/popcount.ll
10

Yes, it is hitting AArch64ISelLowering.cpp:14006:

case ISD::LOAD: {
  assert(SDValue(N, 0).getValueType() == MVT::i128 &&
         "unexpected load's value type");
  LoadSDNode *LoadNode = cast<LoadSDNode>(N);
  if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
    // Non-volatile loads are optimized later in AArch64's load/store
    // optimizer.    // <====== This is not happening
    return;
  }

  SDValue Result = DAG.getMemIntrinsicNode(
      AArch64ISD::LDP, SDLoc(N),
      DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
      {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
      LoadNode->getMemOperand());

  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
                             Result.getValue(0), Result.getValue(1));
  Results.append({Pair, Result.getValue(2) /* Chain */});
  return;
}
shawnl marked an inline comment as not done.Jun 7 2020, 2:34 PM
shawnl added inline comments.
llvm/test/CodeGen/AArch64/popcount.ll
10

With -O0 it outputs:

	ldr	x8, [x0, #8]
	ldr	d0, [x0]
                                        // implicit-def: $q1
	mov	v1.16b, v0.16b
	mov	v1.d[1], x8
efriedma added inline comments.Jun 7 2020, 8:28 PM
llvm/test/CodeGen/AArch64/popcount.ll
10

Hmm. Really, I think the problem reduces to something like the following, which generates a similar ldr+add+ld1 sequence:

define <2 x i64> @z(i64* nocapture nonnull readonly %p) {
  %b = load i64, i64* %p
  %p2 = getelementptr i64, i64* %p, i64 1
  %bb = load i64, i64* %p2
  %r1 = insertelement <2 x i64> zeroinitializer, i64 %b, i32 0
  %r2 = insertelement <2 x i64> %r1, i64 %bb, i32 1
  ret <2 x i64> %r2
}

X86ISelLowering.cpp has some code specifically to handle this; see EltsFromConsecutiveLoads. Maybe some of it should be ported to AArch64.

shawnl updated this revision to Diff 269133.Jun 8 2020, 2:20 AM
shawnl marked an inline comment as not done.

Update tests so the review is not distracted by an unrelated optimization bug.

shawnl marked an inline comment as done.Jun 8 2020, 2:21 AM
shawnl marked an inline comment as done.Jun 8 2020, 3:32 AM
shawnl added inline comments.
llvm/test/CodeGen/AArch64/popcount.ll
10

Why is this not optimized to:

define <2 x i64> @z(i64* nocapture nonnull readonly %p) {
  %p128 = bitcast i64* %p to i128*
  %b = load i128, i128* %p128, align 8
  %r = bitcast i128 %b to <2 x i64>
  ret <2 x i64> %r
}
efriedma accepted this revision.Jun 8 2020, 2:08 PM

LGTM

llvm/test/CodeGen/AArch64/popcount.ll
10

IR optimizations of that sort are very limited at the moment. Maybe something to look into for the vectorcombine pass.

It might be hard for IR-level optimizations to catch all the interesting cases anyway, though, given how much of shuffle lowering happens in SelectionDAG.
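
Concretely, the fold under discussion would rewrite the reduced example above into something like the following (a hand-written sketch of the vector-load form, not the output of any existing pass; the function name is illustrative):

define <2 x i64> @z.folded(i64* nocapture nonnull readonly %p) {
  ; The two consecutive i64 loads plus insertelements collapse into one
  ; <2 x i64> load; the original scalar loads' alignment (8) is preserved.
  %vp = bitcast i64* %p to <2 x i64>*
  %v = load <2 x i64>, <2 x i64>* %vp, align 8
  ret <2 x i64> %v
}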

This revision is now accepted and ready to land.Jun 8 2020, 2:08 PM
shawnl closed this revision.Jun 25 2020, 1:44 AM

This was committed.