This is an archive of the discontinued LLVM Phabricator instance.

llvm/lib/Target/AArch64/AArch64InstrFormats.td
5626 ↗	(On Diff #325419)	As you know I don't mind nice and concise little patterns, but was wondering if we don't expect this simplification to happen earlier?

fhahn added inline comments. · Feb 22 2021, 1:26 PM

llvm/lib/Target/AArch64/AArch64InstrFormats.td
5626 ↗	(On Diff #325419)	Not sure what the exact policy is, but `InstCombinerImpl::visitCallInst()` does optimize some target-specific intrinsics. But I think this would be good to have for instruction selection in any case.

dmgreen added inline comments. · Feb 23 2021, 4:13 AM

llvm/lib/Target/AArch64/AArch64InstrFormats.td
5626 ↗	(On Diff #325419)	Oh, you mean pre-ISel? We lower a vecreduce.add(v16i8 x) to a vecreduce(udot(zero, one, x)), so this needs to be done sometime during ISel lowering at least. I'll add some tests for it. I can make it into a DAG combine. That should capture more cases without extra patterns, and should be simple enough I think.

Convert to a DAGCombine, with some vecreduce tests.

dmgreen added a child revision: D97279: [AArch64] Extend vecreduce -> udot handling to v8i8. · Feb 23 2021, 6:18 AM

LGTM, thanks

This revision is now accepted and ready to land. · Feb 23 2021, 8:16 AM

Closed by commit rG7abf7dd5efe2: [AArch64] Add combine for add(udot(0, x, y), z) -> udot(z, x, y). (authored by dmgreen). · Explain Why · Mar 1 2021, 4:54 AM

This revision was automatically updated to reflect the committed changes.

dmgreen added a commit: rG7abf7dd5efe2: [AArch64] Add combine for add(udot(0, x, y), z) -> udot(z, x, y).

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

25 lines

test/

CodeGen/

AArch64/

neon-dot-product.ll

52 lines

neon-dotreduce.ll

12 lines

Diff 327082

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 13,211 Lines • ▼ Show 20 Lines	static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
EVT ValVT = Val1->getValueType(0);		EVT ValVT = Val1->getValueType(0);
SDLoc DL(N);		SDLoc DL(N);
SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);		SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,		return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),		DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
DAG.getConstant(0, DL, MVT::i64));		DAG.getConstant(0, DL, MVT::i64));
}		}

		// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
		static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
		EVT VT = N->getValueType(0);
		if (N->getOpcode() != ISD::ADD)
		return SDValue();

		SDValue Dot = N->getOperand(0);
		SDValue A = N->getOperand(1);
		// Handle commutativity
		auto isZeroDot = [](SDValue Dot) {
		return (Dot.getOpcode() == AArch64ISD::UDOT ||
		Dot.getOpcode() == AArch64ISD::SDOT) &&
		ISD::isBuildVectorAllZeros(Dot.getOperand(0).getNode());
		};
		if (!isZeroDot(Dot))
		std::swap(Dot, A);
		if (!isZeroDot(Dot))
		return SDValue();

		return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
		Dot.getOperand(2));
		}

// The basic add/sub long vector instructions have variants with "2" on the end		// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by		// which act on the high-half of their inputs. They are normally matched by
// patterns like:		// patterns like:
//		//
// (add (zeroext (extract_high LHS)),		// (add (zeroext (extract_high LHS)),
// (zeroext (extract_high RHS)))		// (zeroext (extract_high RHS)))
// -> uaddl2 vD, vN, vM		// -> uaddl2 vD, vN, vM
//		//
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
}		}

static SDValue performAddSubCombine(SDNode *N,		static SDValue performAddSubCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
// Try to change sum of two reductions.		// Try to change sum of two reductions.
if (SDValue Val = performUADDVCombine(N, DAG))		if (SDValue Val = performUADDVCombine(N, DAG))
return Val;		return Val;
		if (SDValue Val = performAddDotCombine(N, DAG))
		return Val;

return performAddSubLongCombine(N, DCI, DAG);		return performAddSubLongCombine(N, DCI, DAG);
}		}

// Massage DAGs which we can use the high-half "long" operations on into		// Massage DAGs which we can use the high-half "long" operations on into
// something isel will recognize better. E.g.		// something isel will recognize better. E.g.
//		//
// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->		// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
▲ Show 20 Lines • Show All 4,041 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/neon-dot-product.ll

Show First 20 Lines • Show All 49 Lines • ▼ Show 20 Lines	entry:
%vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2		%vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
ret <4 x i32> %vdot1.i		ret <4 x i32> %vdot1.i
}		}


define <2 x i32> @test_vdot_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {		define <2 x i32> @test_vdot_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
; CHECK-LABEL: test_vdot_u32_zero:		; CHECK-LABEL: test_vdot_u32_zero:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v3.2d, #0000000000000000		; CHECK-NEXT: udot v0.2s, v1.8b, v2.8b
; CHECK-NEXT: udot v3.2s, v1.8b, v2.8b
; CHECK-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2		%vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
%ret = add <2 x i32> %vdot1.i, %a		%ret = add <2 x i32> %vdot1.i, %a
ret <2 x i32> %ret		ret <2 x i32> %ret
}		}

define <4 x i32> @test_vdotq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {		define <4 x i32> @test_vdotq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
; CHECK-LABEL: test_vdotq_u32_zero:		; CHECK-LABEL: test_vdotq_u32_zero:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v3.2d, #0000000000000000		; CHECK-NEXT: udot v0.4s, v1.16b, v2.16b
; CHECK-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2		%vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
%ret = add <4 x i32> %vdot1.i, %a		%ret = add <4 x i32> %vdot1.i, %a
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}

define <2 x i32> @test_vdot_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {		define <2 x i32> @test_vdot_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
; CHECK-LABEL: test_vdot_s32_zero:		; CHECK-LABEL: test_vdot_s32_zero:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v3.2d, #0000000000000000		; CHECK-NEXT: sdot v0.2s, v1.8b, v2.8b
; CHECK-NEXT: sdot v3.2s, v1.8b, v2.8b
; CHECK-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2		%vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
%ret = add <2 x i32> %vdot1.i, %a		%ret = add <2 x i32> %vdot1.i, %a
ret <2 x i32> %ret		ret <2 x i32> %ret
}		}

define <4 x i32> @test_vdotq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {		define <4 x i32> @test_vdotq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
; CHECK-LABEL: test_vdotq_s32_zero:		; CHECK-LABEL: test_vdotq_s32_zero:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v3.2d, #0000000000000000		; CHECK-NEXT: sdot v0.4s, v1.16b, v2.16b
; CHECK-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2		%vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
%ret = add <4 x i32> %vdot1.i, %a		%ret = add <4 x i32> %vdot1.i, %a
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}


▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines	entry:
%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>		%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>		%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
%vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2		%vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
ret <4 x i32> %vdot1.i		ret <4 x i32> %vdot1.i
}		}


define <2 x i32> @test_vdot_lane_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {		define <2 x i32> @test_vdot_lane_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
		; CHECK-LABEL: test_vdot_lane_u32_zero:
		; CHECK: // %bb.0: // %entry
		; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
		; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
		; CHECK-NEXT: ret
entry:		entry:
%.cast = bitcast <8 x i8> %c to <2 x i32>		%.cast = bitcast <8 x i8> %c to <2 x i32>
%shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>		%shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>		%.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
%vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2		%vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
%ret = add <2 x i32> %vdot1.i, %a		%ret = add <2 x i32> %vdot1.i, %a
ret <2 x i32> %ret		ret <2 x i32> %ret
}		}

define <4 x i32> @test_vdotq_lane_u32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {		define <4 x i32> @test_vdotq_lane_u32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
		; CHECK-LABEL: test_vdotq_lane_u32_zero:
		; CHECK: // %bb.0: // %entry
		; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
		; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
		; CHECK-NEXT: ret
entry:		entry:
%.cast = bitcast <8 x i8> %c to <2 x i32>		%.cast = bitcast <8 x i8> %c to <2 x i32>
%shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>		%shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>		%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
%vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2		%vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
%ret = add <4 x i32> %vdot1.i, %a		%ret = add <4 x i32> %vdot1.i, %a
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}

define <2 x i32> @test_vdot_laneq_u32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {		define <2 x i32> @test_vdot_laneq_u32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdot_laneq_u32_zero:		; CHECK-LABEL: test_vdot_laneq_u32_zero:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v3.2d, #0000000000000000		; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: udot v3.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%.cast = bitcast <16 x i8> %c to <4 x i32>		%.cast = bitcast <16 x i8> %c to <4 x i32>
%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>		%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
%.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>		%.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
%vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2		%vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
%ret = add <2 x i32> %vdot1.i, %a		%ret = add <2 x i32> %vdot1.i, %a
ret <2 x i32> %ret		ret <2 x i32> %ret
}		}

define <4 x i32> @test_vdotq_laneq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {		define <4 x i32> @test_vdotq_laneq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdotq_laneq_u32_zero:		; CHECK-LABEL: test_vdotq_laneq_u32_zero:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v3.2d, #0000000000000000		; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: udot v3.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%.cast = bitcast <16 x i8> %c to <4 x i32>		%.cast = bitcast <16 x i8> %c to <4 x i32>
%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>		%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>		%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
%vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2		%vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
%ret = add <4 x i32> %vdot1.i, %a		%ret = add <4 x i32> %vdot1.i, %a
ret <4 x i32> %ret		ret <4 x i32> %ret
▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines	entry:
%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>		%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>		%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
%vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2		%vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
ret <4 x i32> %vdot1.i		ret <4 x i32> %vdot1.i
}		}


define <2 x i32> @test_vdot_lane_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {		define <2 x i32> @test_vdot_lane_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
		; CHECK-LABEL: test_vdot_lane_s32_zero:
		; CHECK: // %bb.0: // %entry
		; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
		; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
		; CHECK-NEXT: ret
entry:		entry:
%.cast = bitcast <8 x i8> %c to <2 x i32>		%.cast = bitcast <8 x i8> %c to <2 x i32>
%shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>		%shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>		%.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
%vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2		%vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
%ret = add <2 x i32> %vdot1.i, %a		%ret = add <2 x i32> %vdot1.i, %a
ret <2 x i32> %ret		ret <2 x i32> %ret
}		}

define <4 x i32> @test_vdotq_lane_s32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {		define <4 x i32> @test_vdotq_lane_s32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
		; CHECK-LABEL: test_vdotq_lane_s32_zero:
		; CHECK: // %bb.0: // %entry
		; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
		; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
		; CHECK-NEXT: ret
entry:		entry:
%.cast = bitcast <8 x i8> %c to <2 x i32>		%.cast = bitcast <8 x i8> %c to <2 x i32>
%shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>		%shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>		%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
%vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2		%vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
%ret = add <4 x i32> %vdot1.i, %a		%ret = add <4 x i32> %vdot1.i, %a
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}

define <2 x i32> @test_vdot_laneq_s32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {		define <2 x i32> @test_vdot_laneq_s32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdot_laneq_s32_zero:		; CHECK-LABEL: test_vdot_laneq_s32_zero:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v3.2d, #0000000000000000		; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: sdot v3.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%.cast = bitcast <16 x i8> %c to <4 x i32>		%.cast = bitcast <16 x i8> %c to <4 x i32>
%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>		%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
%.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>		%.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
%vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2		%vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
%ret = add <2 x i32> %vdot1.i, %a		%ret = add <2 x i32> %vdot1.i, %a
ret <2 x i32> %ret		ret <2 x i32> %ret
}		}

define <4 x i32> @test_vdotq_laneq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {		define <4 x i32> @test_vdotq_laneq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdotq_laneq_s32_zero:		; CHECK-LABEL: test_vdotq_laneq_s32_zero:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v3.2d, #0000000000000000		; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: sdot v3.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%.cast = bitcast <16 x i8> %c to <4 x i32>		%.cast = bitcast <16 x i8> %c to <4 x i32>
%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>		%shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>		%.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
%vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2		%vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
%ret = add <4 x i32> %vdot1.i, %a		%ret = add <4 x i32> %vdot1.i, %a
ret <4 x i32> %ret		ret <4 x i32> %ret
}		}

llvm/test/CodeGen/AArch64/neon-dotreduce.ll

Show First 20 Lines • Show All 267 Lines • ▼ Show 20 Lines	entry:
ret i32 %x		ret i32 %x
}		}

define i32 @test_udot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {		define i32 @test_udot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; CHECK-LABEL: test_udot_v16i8_double_nomla:		; CHECK-LABEL: test_udot_v16i8_double_nomla:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v1.16b, #1		; CHECK-NEXT: movi v1.16b, #1
; CHECK-NEXT: movi v3.2d, #0000000000000000		; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: movi v4.2d, #0000000000000000
; CHECK-NEXT: udot v4.4s, v1.16b, v0.16b
; CHECK-NEXT: udot v3.4s, v1.16b, v2.16b		; CHECK-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-NEXT: add v0.4s, v4.4s, v3.4s		; CHECK-NEXT: udot v3.4s, v1.16b, v0.16b
; CHECK-NEXT: addv s0, v0.4s		; CHECK-NEXT: addv s0, v3.4s
; CHECK-NEXT: fmov w0, s0		; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%az = zext <16 x i8> %a to <16 x i32>		%az = zext <16 x i8> %a to <16 x i32>
%r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az)		%r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az)
%cz = zext <16 x i8> %c to <16 x i32>		%cz = zext <16 x i8> %c to <16 x i32>
%r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz)		%r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz)
%x = add i32 %r1, %r2		%x = add i32 %r1, %r2
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	entry:
ret i32 %x		ret i32 %x
}		}

define i32 @test_sdot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {		define i32 @test_sdot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; CHECK-LABEL: test_sdot_v16i8_double_nomla:		; CHECK-LABEL: test_sdot_v16i8_double_nomla:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v1.16b, #1		; CHECK-NEXT: movi v1.16b, #1
; CHECK-NEXT: movi v3.2d, #0000000000000000		; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: movi v4.2d, #0000000000000000
; CHECK-NEXT: sdot v4.4s, v1.16b, v0.16b
; CHECK-NEXT: sdot v3.4s, v1.16b, v2.16b		; CHECK-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-NEXT: add v0.4s, v4.4s, v3.4s		; CHECK-NEXT: sdot v3.4s, v1.16b, v0.16b
; CHECK-NEXT: addv s0, v0.4s		; CHECK-NEXT: addv s0, v3.4s
; CHECK-NEXT: fmov w0, s0		; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%az = sext <16 x i8> %a to <16 x i32>		%az = sext <16 x i8> %a to <16 x i32>
%r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az)		%r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az)
%cz = sext <16 x i8> %c to <16 x i32>		%cz = sext <16 x i8> %c to <16 x i32>
%r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz)		%r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz)
%x = add i32 %r1, %r2		%x = add i32 %r1, %r2
ret i32 %x		ret i32 %x
}		}

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Add patterns for add(udot(0, x, y), z) -> udot(z, x, y). (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 327082

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/neon-dot-product.ll

llvm/test/CodeGen/AArch64/neon-dotreduce.ll

[AArch64] Add patterns for add(udot(0, x, y), z) -> udot(z, x, y).
ClosedPublic