This is an archive of the discontinued LLVM Phabricator instance.

Teach the AArch64 backend that vector reduction NEON instructions implicitly zero the high lanes of the result, meaning that we can eliminate explicit zeroing.
AbandonedPublic

Authored by resistor on Mar 24 2022, 2:42 PM.

Download Raw Diff

Details

Reviewers

dmgreen
efriedma

Diff Detail

Repository: rG LLVM Github Monorepo

Unit TestsFailed

	Time	Test
	60,030 ms	x64 debian > libFuzzer.libFuzzer::fuzzer-leak.test
	60,070 ms	x64 debian > libFuzzer.libFuzzer::large.test

Event Timeline

resistor created this revision.Mar 24 2022, 2:42 PM

Herald added a project: Restricted Project. · View Herald TranscriptMar 24 2022, 2:42 PM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

resistor requested review of this revision.Mar 24 2022, 2:42 PM

Herald added a project: Restricted Project. · View Herald TranscriptMar 24 2022, 2:42 PM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B156150: Diff 418051.Mar 24 2022, 3:20 PM

resistor added a reviewer: greened.Mar 29 2022, 10:51 AM

Ping

My worry with this is that the top lanes are not always defined to be zero by the DAG nodes. There is a comment in the header that says:

// Vector across-lanes addition
// Only the lower result lane is defined.

And they can be selected in a number of ways; things like ADDPv2i64p are defined to produce a scalar result which is inserted into an undef vector.

Maybe that's OK, but we are relying on shaky semantics. Whilst it is true that the ADDV/ADDP instructions clear the top bits (as do many other instructions that set s/d regs), it's not clear to me where that is ensured through the pipeline.

llvm/test/CodeGen/AArch64/vecreduce-zeroing.ll
6	We can usually remove dso_local and local_unnamed_addr #0 to clean up the tests a bit.
62	Can these be removed?

resistor abandoned this revision.Jan 9 2023, 7:26 PM

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

33 lines

test/

CodeGen/

AArch64/

vecreduce-zeroing.ll

64 lines

Diff 418051

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 10,857 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
// SCALAR_TO_VECTOR, except for when we have a single-element constant vector		// SCALAR_TO_VECTOR, except for when we have a single-element constant vector
// as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.		// as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {		if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "		LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
"SCALAR_TO_VECTOR node\n");		"SCALAR_TO_VECTOR node\n");
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);		return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
}		}

		// Handle NEON vector reduction instructions that implicitly zero the high
		// lanes. If we are using a BUILD VECTOR to explicitly zero the high lanes,
		// then we can propagate the BUILD_VECTOR away entirely.
		// (build_vector (extract_vector_elt (umax ...), 0), 0...) -> (umax ...)
		if (usesOnlyOneConstantValue == true && NumConstantLanes == NumElts - 1) {
		SDValue Op0 = Op.getOperand(0);
		bool constantLanesAreZero = false;
		if (ConstantSDNode *cst =
		dyn_cast<ConstantSDNode>(ConstantValue.getNode())) {
		constantLanesAreZero = cst->isZero();
		} else if (ConstantFPSDNode *cst =
		dyn_cast<ConstantFPSDNode>(ConstantValue.getNode())) {
		constantLanesAreZero = cst->isExactlyValue(0.0);
		}
		if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && constantLanesAreZero) {
		SDValue Op0_0 = Op0.getOperand(0);
		SDValue Op0_1 = Op0.getOperand(1);
		ConstantSDNode *ConstantOp0_1 = dyn_cast<ConstantSDNode>(Op0_1.getNode());
		if (ConstantOp0_1 && ConstantOp0_1->isZero() &&
		(Op0_0.getOpcode() == AArch64ISD::UMAXV \|\|
		Op0_0.getOpcode() == AArch64ISD::UMINV \|\|
		Op0_0.getOpcode() == AArch64ISD::SMAXV \|\|
		Op0_0.getOpcode() == AArch64ISD::SMINV \|\|
		Op0_0.getOpcode() == AArch64ISD::UADDV \|\|
		Op0_0.getOpcode() == AArch64ISD::SADDV)) {
		// NOTE: It would be nice to handle FMAXNM/FMINNM here as well, but
		// they are currently modeled as intrinsics that return scalars,
		// which prevents this pattern from being matchable.
		return Op0_0;
		}
		}
		}

if (AllLanesExtractElt) {		if (AllLanesExtractElt) {
SDNode *Vector = nullptr;		SDNode *Vector = nullptr;
bool Even = false;		bool Even = false;
bool Odd = false;		bool Odd = false;
// Check whether the extract elements match the Even pattern <0,2,4,...> or		// Check whether the extract elements match the Even pattern <0,2,4,...> or
// the Odd pattern <1,3,5,...>.		// the Odd pattern <1,3,5,...>.
for (unsigned i = 0; i < NumElts; ++i) {		for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);		SDValue V = Op.getOperand(i);
▲ Show 20 Lines • Show All 9,998 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/vecreduce-zeroing.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
				; RUN: llc < %s \| FileCheck %s
				target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
				target triple = "aarch64-unknown-linux-gnu"

				define dso_local noundef <4 x i32> @umaxv(<4 x i32> noundef %0) local_unnamed_addr #0 {
				[Inline comment by dmgreen (unsubmitted, not done)]: We can usually remove dso_local and local_unnamed_addr #0 to clean up the tests a bit.
				; CHECK-LABEL: umaxv:
				; CHECK: // %bb.0:
				; CHECK-NEXT: umaxv s0, v0.4s
				; CHECK-NEXT: ret
				%2 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %0)
				%3 = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 %2, i64 0
				ret <4 x i32> %3
				}

				define dso_local noundef <4 x i32> @uminv(<4 x i32> noundef %0) local_unnamed_addr #0 {
				; CHECK-LABEL: uminv:
				; CHECK: // %bb.0:
				; CHECK-NEXT: uminv s0, v0.4s
				; CHECK-NEXT: ret
				%2 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %0)
				%3 = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 %2, i64 0
				ret <4 x i32> %3
				}

				define dso_local noundef <4 x i32> @smaxv(<4 x i32> noundef %0) local_unnamed_addr #0 {
				; CHECK-LABEL: smaxv:
				; CHECK: // %bb.0:
				; CHECK-NEXT: smaxv s0, v0.4s
				; CHECK-NEXT: ret
				%2 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %0)
				%3 = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 %2, i64 0
				ret <4 x i32> %3
				}

				define dso_local noundef <4 x i32> @sminv(<4 x i32> noundef %0) local_unnamed_addr #0 {
				; CHECK-LABEL: sminv:
				; CHECK: // %bb.0:
				; CHECK-NEXT: sminv s0, v0.4s
				; CHECK-NEXT: ret
				%2 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %0)
				%3 = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 %2, i64 0
				ret <4 x i32> %3
				}

				define dso_local noundef <4 x i32> @addv(<4 x i32> noundef %0) local_unnamed_addr #0 {
				; CHECK-LABEL: addv:
				; CHECK: // %bb.0:
				; CHECK-NEXT: addv s0, v0.4s
				; CHECK-NEXT: ret
				%2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
				%3 = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 %2, i64 0
				ret <4 x i32> %3
				}

				declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) #1
				declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) #1
				declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) #1
				declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) #1
				declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1

				attributes #0 = { mustprogress nofree nosync nounwind readnone willreturn uwtable "frame-pointer"="non-leaf" "min-legal-vector-width"="128" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+crypto,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+v8.2a" }
				[Inline comment by dmgreen (unsubmitted, not done)]: Can these be removed?
				attributes #1 = { nofree nosync nounwind readnone willreturn }