Diff 448102

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,180 Lines • ▼ Show 20 Lines	if (OrigTy.getSizeInBits() >= 64)
return N;		return N;

// Must extend size to at least 64 bits to be used as an operand for VMULL.		// Must extend size to at least 64 bits to be used as an operand for VMULL.
EVT NewVT = getExtensionTo64Bits(OrigTy);		EVT NewVT = getExtensionTo64Bits(OrigTy);

return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);		return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}		}

		/// Returns true if \p Op is an EXTRACT_VECTOR_ELT node that reads lane 1 of a
		/// fixed-length vector with exactly two elements, i.e. the operand is the
		/// higher half of its source vector.
		///
		/// Only the element count of the source vector is checked here; the element
		/// type is not inspected.
		static bool isOperandOfHigherHalf(SDValue &Op) {
		  SDNode *OpNode = Op.getNode();
		  if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
		    return false;

		  // The lane index (operand 1) must be the constant 1; a non-constant index
		  // or any other lane does not qualify.
		  ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
		  if (!C || C->getZExtValue() != 1)
		    return false;

		  // Lane 1 is the higher half only when the source vector (operand 0) has
		  // exactly two elements.
		  EVT VT = OpNode->getOperand(0).getValueType();

		  return VT.isFixedLengthVector() && VT.getVectorNumElements() == 2;
		}

		/// Returns true only when both \p Op1 and \p Op2 satisfy
		/// isOperandOfHigherHalf. \p Op2 is not examined if \p Op1 fails the check.
		static bool areOperandsOfHigherHalf(SDValue &Op1, SDValue &Op2) {
		  if (!isOperandOfHigherHalf(Op1))
		    return false;
		  return isOperandOfHigherHalf(Op2);
		}

static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,		static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
bool isSigned) {		bool isSigned) {
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);

if (N->getOpcode() != ISD::BUILD_VECTOR)		if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;		return false;

for (const SDValue &Elt : N->op_values()) {		for (const SDValue &Elt : N->op_values()) {
▲ Show 20 Lines • Show All 324 Lines • ▼ Show 20 Lines	if (Ty == MVT::i64) {
Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);		Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);		return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
} else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {		} else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));		return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
} else {		} else {
report_fatal_error("Unexpected type for AArch64 NEON intrinic");		report_fatal_error("Unexpected type for AArch64 NEON intrinic");
}		}
}		}
		case Intrinsic::aarch64_neon_pmull64: {
		SDValue Op1 = Op.getOperand(1);
		SDValue Op2 = Op.getOperand(2);

		// If both operands are higher half of two source SIMD & FP registers,
		// ISel could make use of tablegen patterns to emit PMULL2. So do not
		// legalize i64 to v1i64.
		if (areOperandsOfHigherHalf(Op1, Op2))
		return SDValue();
		dmgreenUnsubmitted Done Reply Inline Actions Can we do this for all operands, not just loads? We should end up adding the i64->v1i64 copy in either case. dmgreen: Can we do this for all operands, not just loads? We should end up adding the i64->v1i64 copy in…
		mingminglAuthorUnsubmitted Done Reply Inline Actions Generalized by doing this for all operands, except when they are the higher half of the SIMD register, and added comments to explain why. When the operands are the higher half, this tablegen pattern is optimal -> test_pmull_high_64 shows this. Also added test2 and test3 for this generalization; https://gcc.godbolt.org/z/79sajTcf7 is the codegen of trunk. test2 has a load and an extract-element; test3 has a load and a direct use of i64. Also, I'd be glad to send an NFC patch of the test case first, so that the diffs become more obvious with this patch. mingmingl: Generalize by doing this for all operands, except when they are higher half of the SIMD…

		dmgreenUnsubmitted Done Reply Inline Actions Op1.getValueType() dmgreen: Op1.getValueType()
		// As a general convention, use "v1" types to represent scalar integer
		// operations in vector registers. This helps ISel to make use of
		// tablegen patterns and generate a load into SIMD & FP registers directly.
		if (Op1.getValueType() == MVT::i64)
		dmgreenUnsubmitted Done Reply Inline Actions We can drop the brackets around single statements. dmgreen: We can drop the brackets around single statements.
		Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op1);
		if (Op2.getValueType() == MVT::i64)
		Op2 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op2);

		return DAG.getNode(
		ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
		DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), Op1,
		Op2);
		}
case Intrinsic::aarch64_neon_smax:		case Intrinsic::aarch64_neon_smax:
return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),		return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));		Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_umax:		case Intrinsic::aarch64_neon_umax:
return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),		return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));		Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_smin:		case Intrinsic::aarch64_neon_smin:
return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),		return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
▲ Show 20 Lines • Show All 17,408 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll

Show First 20 Lines • Show All 2,501 Lines • ▼ Show 20 Lines	entry:
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>		%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)		%vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
ret <8 x i16> %vmull.i.i		ret <8 x i16> %vmull.i.i
}		}

define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {		define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
; CHECK-LABEL: test_vmull_p64:		; CHECK-LABEL: test_vmull_p64:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, x0		; CHECK-NEXT: fmov d0, x1
; CHECK-NEXT: fmov d1, x1		; CHECK-NEXT: fmov d1, x0
; CHECK-NEXT: pmull v0.1q, v0.1d, v1.1d		; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
; CHECK-NEXT: mov x1, v0.d[1]		; CHECK-NEXT: mov x1, v0.d[1]
; CHECK-NEXT: fmov x0, d0		; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)		%vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
%vmull3.i = bitcast <16 x i8> %vmull2.i to i128		%vmull3.i = bitcast <16 x i8> %vmull2.i to i128
ret i128 %vmull3.i		ret i128 %vmull3.i
}		}
Show All 17 Lines

llvm/test/CodeGen/AArch64/arm64-vmul.ll

Show First 20 Lines • Show All 2,919 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)		%tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
%tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)		%tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
ret i64 %tmp5		ret i64 %tmp5
}		}

define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {		define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
; CHECK-LABEL: test_pmull_64:		; CHECK-LABEL: test_pmull_64:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0		; CHECK-NEXT: fmov d0, x1
; CHECK-NEXT: fmov d1, x1		; CHECK-NEXT: fmov d1, x0
; CHECK-NEXT: pmull.1q v0, v0, v1		; CHECK-NEXT: pmull.1q v0, v1, v0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)		%val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
ret <16 x i8> %val		ret <16 x i8> %val
}		}

define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {		define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
; CHECK-LABEL: test_pmull_high_64:		; CHECK-LABEL: test_pmull_high_64:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
Show All 23 Lines

llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s\| FileCheck %s --check-prefixes=CHECK

				dmgreenUnsubmitted Done Reply Inline Actions I think use -mattr=+aes. (Even if that is not technically correct, it is what the instruction uses) And it doesn't need CHECK-SDAG dmgreen: I think use -mattr=+aes. (Even if that is not technically correct, it is what the instruction…
				; Tests that scalar i64 arguments of llvm.aarch64.neon.pmull64 are
				; loaded into SIMD registers, as opposed to being loaded into GPR followed by a mov.

				; test1: both pmull64 operands are i64 values loaded from element 1 of
				; <2 x i64> slots indexed by %1 and %2. The expected assembly loads each
				; operand with `ldr d` and contains no GPR-to-SIMD `fmov`. The result is
				; stored to the slot indexed by %1.
				define void @test1(ptr %0, i64 %1, i64 %2) {
				; CHECK-LABEL: test1:
				; CHECK: // %bb.0:
				; CHECK-NEXT: add x8, x0, x2, lsl #4
				; CHECK-NEXT: add x9, x0, x1, lsl #4
				; CHECK-NEXT: ldr d0, [x8, #8]
				; CHECK-NEXT: ldr d1, [x9, #8]
				; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
				; CHECK-NEXT: str q0, [x9]
				; CHECK-NEXT: ret
				%4 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
				%5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
				%6 = load i64, ptr %5, align 8
				%7 = getelementptr inbounds <2 x i64>, ptr %0, i64 %2, i64 1
				%8 = load i64, ptr %7, align 8
				%9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 %8)
				store <16 x i8> %9, ptr %4, align 16
				ret void
				}

				; test2: the first pmull64 operand is an i64 load; the second is lane 1
				; extracted from the <2 x i64> argument %3. The expected assembly loads the
				; first operand with `ldr d0` and moves the extracted lane through a GPR
				; (`mov x9, v0.d[1]` followed by `fmov d1, x9`). Argument %2 is unused.
				define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
				; CHECK-LABEL: test2:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov x9, v0.d[1]
				; CHECK-NEXT: add x8, x0, x1, lsl #4
				; CHECK-NEXT: ldr d0, [x8, #8]
				; CHECK-NEXT: fmov d1, x9
				dmgreenUnsubmitted Done Reply Inline Actions The test could be simpler if it wasn't in a loop. dmgreen: The test could be simpler if it wasn't in a loop.
				; CHECK-NEXT: pmull v0.1q, v0.1d, v1.1d
				; CHECK-NEXT: str q0, [x8]
				; CHECK-NEXT: ret
				%5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
				%6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
				%7 = load i64, ptr %6, align 8
				%8 = extractelement <2 x i64> %3, i64 1
				%9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %8)
				store <16 x i8> %9, ptr %5, align 16
				ret void
				}

				; test3: the first pmull64 operand is an i64 load; the second is the plain
				; i64 argument %3. The expected assembly loads the first operand with
				; `ldr d1` and moves the argument into a SIMD register with `fmov d0, x3`.
				; Argument %2 is unused.
				define void @test3(ptr %0, i64 %1, i64 %2, i64 %3) {
				; CHECK-LABEL: test3:
				; CHECK: // %bb.0:
				; CHECK-NEXT: add x8, x0, x1, lsl #4
				; CHECK-NEXT: fmov d0, x3
				; CHECK-NEXT: ldr d1, [x8, #8]
				; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
				; CHECK-NEXT: str q0, [x8]
				; CHECK-NEXT: ret
				%5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
				%6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
				%7 = load i64, ptr %6, align 8
				%8 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %3)
				store <16 x i8> %8, ptr %5, align 16
				ret void
				}

				declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Explicitly use v1i64 type for llvm.aarch64.neon.pmull64
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 448102

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll

llvm/test/CodeGen/AArch64/arm64-vmul.ll

llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Explicitly use v1i64 type for llvm.aarch64.neon.pmull64ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 448102

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll

llvm/test/CodeGen/AArch64/arm64-vmul.ll

llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll

[AArch64] Explicitly use v1i64 type for llvm.aarch64.neon.pmull64
ClosedPublic