This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AArch64/
-
Target/
-
AArch64/
-
AArch64ISelLowering.cpp
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
-
sve-intrinsics-int-arith-merging.ll

Differential D128606

[WIP][AArch64][DAGCombiner] Swap the operations of logical operation AND to match movprfx
AbandonedPublic

Authored by Allen on Jun 26 2022, 3:59 AM.

Download Raw Diff

Details

Reviewers

paulwalker-arm
dmgreen
david-arm

Diff Detail

Unit TestsFailed

	Time	Test
	60,130 ms	x64 debian > AddressSanitizer-x86_64-linux-dynamic.TestCases::scariness_score_test.cpp
	60,090 ms	x64 debian > AddressSanitizer-x86_64-linux.TestCases::scariness_score_test.cpp
	60,020 ms	x64 debian > libFuzzer.libFuzzer::fuzzer-leak.test

Event Timeline

Allen created this revision.Jun 26 2022, 3:59 AM

Herald added a project: Restricted Project. · View Herald TranscriptJun 26 2022, 3:59 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

Allen requested review of this revision.Jun 26 2022, 3:59 AM

Herald added a project: Restricted Project. · View Herald TranscriptJun 26 2022, 3:59 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B172076: Diff 440053.Jun 26 2022, 5:19 AM

Is this optimisation valid? The merging SVE intrinsics have strict rules about what happens to inactive lanes. For the llvm.aarch64.sve.and the inactive lanes are set to the matching lanes of the first operand. This means that the inactive lanes of the second operand play no role in the operation and thus the example in and_i64_zero_comm is not a zeroing and.

However, given the inactive lanes of the second operand play no role, this effectively means the select is redundant and can be optimised away as an instcombine before it gets to code generation. So I guess the question is whether you are seeing this issue in real code and thus it's worth implementing the instcombine.

Matt added a subscriber: Matt.Jun 28 2022, 2:22 PM

In D128606#3615200, @paulwalker-arm wrote:

Is this optimisation valid? The merging SVE intrinsics have strict rules about what happens to inactive lanes. For the llvm.aarch64.sve.and the inactive lanes are set to the matching lanes of the first operand. This means that the inactive lanes of the second operand play no role in the operation and thus the example in and_i64_zero_comm is not a zeroing and.

However, given the inactive lanes of the second operand play no role, this effectively means the select is redundant and can be optimised away as an instcombine before it gets to code generation. So I guess the question is whether you are seeing this issue in real code and thus it's worth implementing the instcombine.

Oh, sorry, and thanks @paulwalker-arm for the reminder. I forgot to use clang end-to-end to confirm the final assembly. At first I thought that generating movprfx would give better performance, without realizing that the select is redundant in this case.
Indeed, the instructions generated by the s113_tuned version are more efficient in the link https://gcc.godbolt.org/z/P14sb6MPq

Allen retitled this revision from [AArch64][DAGCombiner] Swap the operations of logical operation AND to match movprfx to [WIP][AArch64][DAGCombiner] Swap the operations of logical operation AND to match movprfx.Jun 30 2022, 5:11 AM

Just trying to cleanup my review list since we're agreed a different approach is required.

This revision now requires changes to proceed.Jul 15 2022, 9:04 AM

sorry, forget to adopt

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

46 lines

test/

CodeGen/

AArch64/

sve-intrinsics-int-arith-merging.ll

14 lines

Diff 440053

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 16,117 Lines • ▼ Show 20 Lines	if (MaxSVESize && MinSVESize == MaxSVESize) {
getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));		getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
return PatNumElts == (NumElts * VScale);		return PatNumElts == (NumElts * VScale);
}		}
}		}

return false;		return false;
}		}

		// Return true if the 2nd operand of specific intrinsic is zero.
		static bool isSwapPredicateZeroing(SDValue VSel, SDNode *N) {
		assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
		if (VSel.getOpcode() != ISD::VSELECT)
		return false;

		unsigned IID = getIntrinsicID(N);
		switch (IID) {
		// TODO: Add more intrinsic once we have test coverage.
		case Intrinsic::aarch64_sve_and:
		return isAllInactivePredicate(VSel.getOperand(2));
		}

		return false;
		}

// If a merged operation has no inactive lanes we can relax it to a predicated		// If a merged operation has no inactive lanes we can relax it to a predicated
// or unpredicated operation, which potentially allows better isel (perhaps		// or unpredicated operation, which potentially allows better isel (perhaps
// using immediate forms) or relaxing register reuse requirements.		// using immediate forms) or relaxing register reuse requirements.
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,		static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
SelectionDAG &DAG, bool UnpredOp = false,		SelectionDAG &DAG, bool UnpredOp = false,
bool SwapOperands = false) {		bool SwapOperands = false) {
assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");		assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");		assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
SDValue Pg = N->getOperand(1);		SDValue Pg = N->getOperand(1);
SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);		SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);		SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);

// ISD way to specify an all active predicate.		// ISD way to specify an all active predicate.
if (isAllActivePredicate(DAG, Pg)) {		if (isAllActivePredicate(DAG, Pg)) {
if (UnpredOp)		if (UnpredOp)
return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);		return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);

return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);		return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
}		}

// FUTURE: SplatVector(true)		// FUTURE: SplatVector(true)
return SDValue();		return SDValue();
}		}

		// Swap the operands when the 2nd operand is coming from a VSELECT
		// with predicate zeroing.
		//
		static SDValue tryCombineOpWithPredicateZeroing(
		unsigned IID, unsigned Opc, SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
		SelectionDAG &DAG, bool UnpredOp = false, bool SwapOperands = false) {
		assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");

		if (SDValue Res =
		convertMergedOpToPredOp(N, Opc, DAG, UnpredOp, SwapOperands))
		return Res;

		if (DCI.isAfterLegalizeDAG())
		return SDValue();

		// Swap to candidate for movprfx if it is predicate zeroing.
		SDValue Pg = N->getOperand(1);
		SDValue Op1 = N->getOperand(2);
		SDValue Op2 = N->getOperand(3);
		if (isSwapPredicateZeroing(Op2, N)) {
		SDValue ID = DAG.getTargetConstant(IID, SDLoc(N), MVT::i64);
		return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
		ID, Pg, Op2, Op1);
		}

		return SDValue();
		}

static SDValue performIntrinsicCombine(SDNode *N,		static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {		const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;		SelectionDAG &DAG = DCI.DAG;
unsigned IID = getIntrinsicID(N);		unsigned IID = getIntrinsicID(N);
switch (IID) {		switch (IID) {
default:		default:
break;		break;
▲ Show 20 Lines • Show All 139 Lines • ▼ Show 20 Lines	case Intrinsic::aarch64_sve_fmul:
return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);		return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
case Intrinsic::aarch64_sve_add:		case Intrinsic::aarch64_sve_add:
return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);		return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
case Intrinsic::aarch64_sve_sub:		case Intrinsic::aarch64_sve_sub:
return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);		return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
case Intrinsic::aarch64_sve_subr:		case Intrinsic::aarch64_sve_subr:
return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);		return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
case Intrinsic::aarch64_sve_and:		case Intrinsic::aarch64_sve_and:
return convertMergedOpToPredOp(N, ISD::AND, DAG, true);		return tryCombineOpWithPredicateZeroing (IID, ISD::AND, N, DCI, DAG, true);
case Intrinsic::aarch64_sve_bic:		case Intrinsic::aarch64_sve_bic:
return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);		return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);
case Intrinsic::aarch64_sve_eor:		case Intrinsic::aarch64_sve_eor:
return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);		return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
case Intrinsic::aarch64_sve_orr:		case Intrinsic::aarch64_sve_orr:
return convertMergedOpToPredOp(N, ISD::OR, DAG, true);		return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
case Intrinsic::aarch64_sve_sabd:		case Intrinsic::aarch64_sve_sabd:
return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true);		return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true);
▲ Show 20 Lines • Show All 5,135 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll

	Show First 20 Lines • Show All 331 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer			%a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
	%out = call <vscale x 2 x i64> @llvm.aarch64.sve.and.nxv2i64(<vscale x 2 x i1> %pg,			%out = call <vscale x 2 x i64> @llvm.aarch64.sve.and.nxv2i64(<vscale x 2 x i1> %pg,
	<vscale x 2 x i64> %a_z,			<vscale x 2 x i64> %a_z,
	<vscale x 2 x i64> %b)			<vscale x 2 x i64> %b)
	ret <vscale x 2 x i64> %out			ret <vscale x 2 x i64> %out
	}			}

				; commutative operation
				define <vscale x 2 x i64> @and_i64_zero_comm(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
				; CHECK-LABEL: and_i64_zero_comm:
				; CHECK: // %bb.0:
				; CHECK-NEXT: movprfx z0.d, p0/z, z0.d
				; CHECK-NEXT: and z0.d, p0/m, z0.d, z1.d
				; CHECK-NEXT: ret
				%a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
				%out = call <vscale x 2 x i64> @llvm.aarch64.sve.and.nxv2i64(<vscale x 2 x i1> %pg,
				<vscale x 2 x i64> %b,
				<vscale x 2 x i64> %a_z)
				ret <vscale x 2 x i64> %out
				}

	;			;
	; BIC			; BIC
	;			;

	define <vscale x 16 x i8> @bic_i8_zero(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {			define <vscale x 16 x i8> @bic_i8_zero(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
	; CHECK-LABEL: bic_i8_zero:			; CHECK-LABEL: bic_i8_zero:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: mov z2.b, #0 // =0x0			; CHECK-NEXT: mov z2.b, #0 // =0x0
	▲ Show 20 Lines • Show All 86 Lines • Show Last 20 Lines