This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
clang/
-
include/clang/Basic/
-
clang/
-
Basic/
-
arm_mve.td
-
test/CodeGen/arm-mve-intrinsics/
-
CodeGen/
-
arm-mve-intrinsics/
-
ternary.c
-
llvm/
-
include/llvm/IR/
-
llvm/
-
IR/
-
IntrinsicsARM.td
-
lib/Target/ARM/
-
Target/
-
ARM/
-
ARMISelLowering.h
1/2
ARMISelLowering.cpp
1/1
ARMInstrMVE.td
-
test/CodeGen/Thumb2/mve-intrinsics/
-
CodeGen/
-
Thumb2/
-
mve-intrinsics/
-
ternary.ll

Differential D76122

[ARM,MVE] Add intrinsics and isel for MVE integer VMLA.
ClosedPublic

Authored by simon_tatham on Mar 13 2020, 2:34 AM.

Download Raw Diff

Details

Reviewers

dmgreen
MarkMurrayARM
miyuki
ostannard

Commits

rG28c5d97beec7: [ARM,MVE] Add intrinsics and isel for MVE integer VMLA.

Summary

These instructions compute multiply+add in integers, with one of the
operands being a splat of a scalar. (VMLA and VMLAS differ in whether
the splat operand is a multiplier or the addend.)

I've represented these in IR using existing standard IR operations for
the unpredicated forms. The predicated forms are done with target-
specific intrinsics, as usual.

When operating on n-bit vector lanes, only the bottom n bits of the
i32 scalar operand are used. So we have to tell that to isel lowering,
to allow it to remove a pointless sign- or zero-extension instruction
on that input register. That's done in PerformIntrinsicCombine, but
first I had to enable PerformIntrinsicCombine for MVE targets
(previously all the intrinsics it handled were for NEON), and make it
a method of ARMTargetLowering so that it can get at
SimplifyDemandedBits.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

simon_tatham created this revision. Mar 13 2020, 2:34 AM

Herald added a project: Restricted Project. · View Herald TranscriptMar 13 2020, 2:34 AM

Herald added subscribers: cfe-commits, danielkiss, hiraditya, kristof.beyls. · View Herald Transcript

Harbormaster failed remote builds in B49112: Diff 250156! Mar 13 2020, 3:41 AM

Fix clang-format warnings.

(Is it really sensible to require even test input files, which are often autogenerated, to be fixed points of clang-format?)

Harbormaster failed remote builds in B49123: Diff 250174! Mar 13 2020, 5:06 AM

LGTM

llvm/lib/Target/ARM/ARMISelLowering.cpp
14289–14292	I know it's not part of this patch, but what is this code for?
llvm/lib/Target/ARM/ARMInstrMVE.td
5172	Can you make it so that the patterns are: def : Pat<(VTI.Vec (add (mul v1, v2), vs)), (VTI.Vec (Inst v1, v2, s))>; so they are easier to read?

This revision is now accepted and ready to land. Mar 16 2020, 7:38 AM

simon_tatham marked an inline comment as done. Mar 16 2020, 10:14 AM

simon_tatham added inline comments.

llvm/lib/Target/ARM/ARMISelLowering.cpp
14289–14292	I have no idea! They date from rG2e076c4e02fb99 in 2009 which first introduced NEON support, and even then, the switch they appear in had a `default: break` clause. My best guess is that they're intended as a kind of documentation. They signal to the reader: "I didn't forget about these intrinsics, I carefully considered them, decided they didn't need handling here, and included a comment saying why not."

Closed by commit rG28c5d97beec7: [ARM,MVE] Add intrinsics and isel for MVE integer VMLA. (authored by simon_tatham). · Explain Why · Mar 18 2020, 4:19 AM

This revision was automatically updated to reflect the committed changes.

simon_tatham marked an inline comment as done.

Revision Contents

Path

Size

clang/

include/

clang/

Basic/

arm_mve.td

18 lines

test/

CodeGen/

arm-mve-intrinsics/

ternary.c

380 lines

llvm/

include/

llvm/

IR/

IntrinsicsARM.td

8 lines

lib/

Target/

ARM/

ARMISelLowering.h

1 line

ARMISelLowering.cpp

22 lines

ARMInstrMVE.td

77 lines

test/

CodeGen/

Thumb2/

mve-intrinsics/

ternary.ll

345 lines

Diff 250174

clang/include/clang/Basic/arm_mve.td

Show First 20 Lines • Show All 196 Lines • ▼ Show 20 Lines	multiclass FMA<bit add> {
}		}
}		}

let params = T.Float in {		let params = T.Float in {
defm vfma: FMA<1>;		defm vfma: FMA<1>;
defm vfms: FMA<0>;		defm vfms: FMA<0>;
}		}

		let params = T.Int, pnt = PNT_NType in {
		def vmlaq_n: Intrinsic<
		Vector, (args Vector:$addend, Vector:$m1, unpromoted<Scalar>:$m2_s),
		(add (mul $m1, (splat $m2_s)), $addend)>;
		def vmlasq_n: Intrinsic<
		Vector, (args Vector:$m1, Vector:$m2, unpromoted<Scalar>:$addend_s),
		(add (mul $m1, $m2), (splat $addend_s))>;

		def vmlaq_m_n: Intrinsic<
		Vector, (args Vector:$addend, Vector:$m1, Scalar:$m2_s, Predicate:$pred),
		(IRInt<"vmla_n_predicated", [Vector, Predicate]>
		$addend, $m1, $m2_s, $pred)>;
		def vmlasq_m_n: Intrinsic<
		Vector, (args Vector:$m1, Vector:$m2, Scalar:$addend_s, Predicate:$pred),
		(IRInt<"vmlas_n_predicated", [Vector, Predicate]>
		$m1, $m2, $addend_s, $pred)>;
		}

let params = !listconcat(T.Int16, T.Int32) in {		let params = !listconcat(T.Int16, T.Int32) in {
let pnt = PNT_None in {		let pnt = PNT_None in {
def vmvnq_n: Intrinsic<Vector, (args imm_simd_vmvn:$imm),		def vmvnq_n: Intrinsic<Vector, (args imm_simd_vmvn:$imm),
(not (splat (Scalar $imm)))>;		(not (splat (Scalar $imm)))>;
}		}
defm vmvnq: IntrinsicMX<Vector, (args imm_simd_vmvn:$imm, Predicate:$pred),		defm vmvnq: IntrinsicMX<Vector, (args imm_simd_vmvn:$imm, Predicate:$pred),
(select $pred, (not (splat (Scalar $imm))), $inactive),		(select $pred, (not (splat (Scalar $imm))), $inactive),
1, "_n", PNT_NType, PNT_None>;		1, "_n", PNT_NType, PNT_None>;
▲ Show 20 Lines • Show All 1,278 Lines • Show Last 20 Lines

clang/test/CodeGen/arm-mve-intrinsics/ternary.c

	Show First 20 Lines • Show All 118 Lines • ▼ Show 20 Lines
	float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {			float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
	#ifdef POLYMORPHIC			#ifdef POLYMORPHIC
	return vfmsq(a, b, c);			return vfmsq(a, b, c);
	#else /* POLYMORPHIC */			#else /* POLYMORPHIC */
	return vfmsq_f32(a, b, c);			return vfmsq_f32(a, b, c);
	#endif /* POLYMORPHIC */			#endif /* POLYMORPHIC */
	}			}

				// CHECK-LABEL: @test_vmlaq_n_s8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[B:%.*]], [[DOTSPLAT]]
				// CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[TMP0]], [[A:%.*]]
				// CHECK-NEXT: ret <16 x i8> [[TMP1]]
				//
				int8x16_t test_vmlaq_n_s8(int8x16_t a, int8x16_t b, int8_t c) {
				#ifdef POLYMORPHIC
				return vmlaq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlaq_n_s8(a, b, c);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlaq_n_s16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[DOTSPLATINSERT:%.]] = insertelement <8 x i16> undef, i16 [[C:%.]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP0:%.]] = mul <8 x i16> [[B:%.]], [[DOTSPLAT]]
				// CHECK-NEXT: [[TMP1:%.]] = add <8 x i16> [[TMP0]], [[A:%.]]
				// CHECK-NEXT: ret <8 x i16> [[TMP1]]
				//
				int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
				#ifdef POLYMORPHIC
				return vmlaq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlaq_n_s16(a, b, c);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlaq_n_s32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[DOTSPLATINSERT:%.]] = insertelement <4 x i32> undef, i32 [[C:%.]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP0:%.]] = mul <4 x i32> [[B:%.]], [[DOTSPLAT]]
				// CHECK-NEXT: [[TMP1:%.]] = add <4 x i32> [[TMP0]], [[A:%.]]
				// CHECK-NEXT: ret <4 x i32> [[TMP1]]
				//
				int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
				#ifdef POLYMORPHIC
				return vmlaq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlaq_n_s32(a, b, c);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlaq_n_u8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[DOTSPLATINSERT:%.]] = insertelement <16 x i8> undef, i8 [[C:%.]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP0:%.]] = mul <16 x i8> [[B:%.]], [[DOTSPLAT]]
				// CHECK-NEXT: [[TMP1:%.]] = add <16 x i8> [[TMP0]], [[A:%.]]
				// CHECK-NEXT: ret <16 x i8> [[TMP1]]
				//
				uint8x16_t test_vmlaq_n_u8(uint8x16_t a, uint8x16_t b, uint8_t c) {
				#ifdef POLYMORPHIC
				return vmlaq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlaq_n_u8(a, b, c);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlaq_n_u16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[DOTSPLATINSERT:%.]] = insertelement <8 x i16> undef, i16 [[C:%.]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP0:%.]] = mul <8 x i16> [[B:%.]], [[DOTSPLAT]]
				// CHECK-NEXT: [[TMP1:%.]] = add <8 x i16> [[TMP0]], [[A:%.]]
				// CHECK-NEXT: ret <8 x i16> [[TMP1]]
				//
				uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
				#ifdef POLYMORPHIC
				return vmlaq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlaq_n_u16(a, b, c);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlaq_n_u32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[DOTSPLATINSERT:%.]] = insertelement <4 x i32> undef, i32 [[C:%.]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP0:%.]] = mul <4 x i32> [[B:%.]], [[DOTSPLAT]]
				// CHECK-NEXT: [[TMP1:%.]] = add <4 x i32> [[TMP0]], [[A:%.]]
				// CHECK-NEXT: ret <4 x i32> [[TMP1]]
				//
				uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
				#ifdef POLYMORPHIC
				return vmlaq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlaq_n_u32(a, b, c);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_n_s8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[A:%.*]], [[B:%.*]]
				// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[TMP0]], [[DOTSPLAT]]
				// CHECK-NEXT: ret <16 x i8> [[TMP1]]
				//
				int8x16_t test_vmlasq_n_s8(int8x16_t a, int8x16_t b, int8_t c) {
				#ifdef POLYMORPHIC
				return vmlasq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlasq_n_s8(a, b, c);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_n_s16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = mul <8 x i16> [[A:%.]], [[B:%.*]]
				// CHECK-NEXT: [[DOTSPLATINSERT:%.]] = insertelement <8 x i16> undef, i16 [[C:%.]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[TMP0]], [[DOTSPLAT]]
				// CHECK-NEXT: ret <8 x i16> [[TMP1]]
				//
				int16x8_t test_vmlasq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
				#ifdef POLYMORPHIC
				return vmlasq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlasq_n_s16(a, b, c);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_n_s32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = mul <4 x i32> [[A:%.]], [[B:%.*]]
				// CHECK-NEXT: [[DOTSPLATINSERT:%.]] = insertelement <4 x i32> undef, i32 [[C:%.]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[TMP0]], [[DOTSPLAT]]
				// CHECK-NEXT: ret <4 x i32> [[TMP1]]
				//
				int32x4_t test_vmlasq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
				#ifdef POLYMORPHIC
				return vmlasq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlasq_n_s32(a, b, c);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_n_u8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = mul <16 x i8> [[A:%.]], [[B:%.*]]
				// CHECK-NEXT: [[DOTSPLATINSERT:%.]] = insertelement <16 x i8> undef, i8 [[C:%.]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[TMP0]], [[DOTSPLAT]]
				// CHECK-NEXT: ret <16 x i8> [[TMP1]]
				//
				uint8x16_t test_vmlasq_n_u8(uint8x16_t a, uint8x16_t b, uint8_t c) {
				#ifdef POLYMORPHIC
				return vmlasq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlasq_n_u8(a, b, c);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_n_u16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = mul <8 x i16> [[A:%.]], [[B:%.*]]
				// CHECK-NEXT: [[DOTSPLATINSERT:%.]] = insertelement <8 x i16> undef, i16 [[C:%.]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[TMP0]], [[DOTSPLAT]]
				// CHECK-NEXT: ret <8 x i16> [[TMP1]]
				//
				uint16x8_t test_vmlasq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
				#ifdef POLYMORPHIC
				return vmlasq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlasq_n_u16(a, b, c);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_n_u32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = mul <4 x i32> [[A:%.]], [[B:%.*]]
				// CHECK-NEXT: [[DOTSPLATINSERT:%.]] = insertelement <4 x i32> undef, i32 [[C:%.]], i32 0
				// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[TMP0]], [[DOTSPLAT]]
				// CHECK-NEXT: ret <4 x i32> [[TMP1]]
				//
				uint32x4_t test_vmlasq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
				#ifdef POLYMORPHIC
				return vmlasq(a, b, c);
				#else /* POLYMORPHIC */
				return vmlasq_n_u32(a, b, c);
				#endif /* POLYMORPHIC */
				}

	// CHECK-LABEL: @test_vfmaq_m_f16(			// CHECK-LABEL: @test_vfmaq_m_f16(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[TMP0:%.]] = zext i16 [[P:%.]] to i32			// CHECK-NEXT: [[TMP0:%.]] = zext i16 [[P:%.]] to i32
	// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])			// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
	// CHECK-NEXT: [[TMP2:%.]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.]], <8 x half> [[C:%.]], <8 x half> [[A:%.]], <8 x i1> [[TMP1]])			// CHECK-NEXT: [[TMP2:%.]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.]], <8 x half> [[C:%.]], <8 x half> [[A:%.]], <8 x i1> [[TMP1]])
	// CHECK-NEXT: ret <8 x half> [[TMP2]]			// CHECK-NEXT: ret <8 x half> [[TMP2]]
	//			//
	float16x8_t test_vfmaq_m_f16(float16x8_t a, float16x8_t b, float16x8_t c, mve_pred16_t p) {			float16x8_t test_vfmaq_m_f16(float16x8_t a, float16x8_t b, float16x8_t c, mve_pred16_t p) {
	▲ Show 20 Lines • Show All 119 Lines • ▼ Show 20 Lines
	//			//
	float32x4_t test_vfmsq_m_f32(float32x4_t a, float32x4_t b, float32x4_t c, mve_pred16_t p) {			float32x4_t test_vfmsq_m_f32(float32x4_t a, float32x4_t b, float32x4_t c, mve_pred16_t p) {
	#ifdef POLYMORPHIC			#ifdef POLYMORPHIC
	return vfmsq_m(a, b, c, p);			return vfmsq_m(a, b, c, p);
	#else /* POLYMORPHIC */			#else /* POLYMORPHIC */
	return vfmsq_m_f32(a, b, c, p);			return vfmsq_m_f32(a, b, c, p);
	#endif /* POLYMORPHIC */			#endif /* POLYMORPHIC */
	}			}

				// CHECK-LABEL: @test_vmlaq_m_n_s8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
				// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
				// CHECK-NEXT: ret <16 x i8> [[TMP3]]
				//
				int8x16_t test_vmlaq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlaq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlaq_m_n_s8(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlaq_m_n_s16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = zext i16 [[C:%.]] to i32
				// CHECK-NEXT: [[TMP1:%.]] = zext i16 [[P:%.]] to i32
				// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
				// CHECK-NEXT: [[TMP3:%.]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[B:%.]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
				// CHECK-NEXT: ret <8 x i16> [[TMP3]]
				//
				int16x8_t test_vmlaq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlaq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlaq_m_n_s16(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlaq_m_n_s32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = zext i16 [[P:%.]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[B:%.]], <4 x i32> [[A:%.]], i32 [[C:%.]], <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x i32> [[TMP2]]
				//
				int32x4_t test_vmlaq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlaq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlaq_m_n_s32(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlaq_m_n_u8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = zext i8 [[C:%.]] to i32
				// CHECK-NEXT: [[TMP1:%.]] = zext i16 [[P:%.]] to i32
				// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
				// CHECK-NEXT: [[TMP3:%.]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[B:%.]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
				// CHECK-NEXT: ret <16 x i8> [[TMP3]]
				//
				uint8x16_t test_vmlaq_m_n_u8(uint8x16_t a, uint8x16_t b, uint8_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlaq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlaq_m_n_u8(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlaq_m_n_u16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = zext i16 [[C:%.]] to i32
				// CHECK-NEXT: [[TMP1:%.]] = zext i16 [[P:%.]] to i32
				// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
				// CHECK-NEXT: [[TMP3:%.]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[B:%.]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
				// CHECK-NEXT: ret <8 x i16> [[TMP3]]
				//
				uint16x8_t test_vmlaq_m_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlaq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlaq_m_n_u16(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlaq_m_n_u32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = zext i16 [[P:%.]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[B:%.]], <4 x i32> [[A:%.]], i32 [[C:%.]], <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x i32> [[TMP2]]
				//
				uint32x4_t test_vmlaq_m_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlaq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlaq_m_n_u32(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_m_n_s8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
				// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
				// CHECK-NEXT: ret <16 x i8> [[TMP3]]
				//
				int8x16_t test_vmlasq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlasq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlasq_m_n_s8(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_m_n_s16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = zext i16 [[C:%.]] to i32
				// CHECK-NEXT: [[TMP1:%.]] = zext i16 [[P:%.]] to i32
				// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
				// CHECK-NEXT: [[TMP3:%.]] = call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> [[A:%.]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
				// CHECK-NEXT: ret <8 x i16> [[TMP3]]
				//
				int16x8_t test_vmlasq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlasq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlasq_m_n_s16(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_m_n_s32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = zext i16 [[P:%.]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.]] = call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> [[A:%.]], <4 x i32> [[B:%.]], i32 [[C:%.]], <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x i32> [[TMP2]]
				//
				int32x4_t test_vmlasq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlasq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlasq_m_n_s32(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_m_n_u8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = zext i8 [[C:%.]] to i32
				// CHECK-NEXT: [[TMP1:%.]] = zext i16 [[P:%.]] to i32
				// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
				// CHECK-NEXT: [[TMP3:%.]] = call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> [[A:%.]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
				// CHECK-NEXT: ret <16 x i8> [[TMP3]]
				//
				uint8x16_t test_vmlasq_m_n_u8(uint8x16_t a, uint8x16_t b, uint8_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlasq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlasq_m_n_u8(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_m_n_u16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = zext i16 [[C:%.]] to i32
				// CHECK-NEXT: [[TMP1:%.]] = zext i16 [[P:%.]] to i32
				// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
				// CHECK-NEXT: [[TMP3:%.]] = call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> [[A:%.]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
				// CHECK-NEXT: ret <8 x i16> [[TMP3]]
				//
				uint16x8_t test_vmlasq_m_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlasq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlasq_m_n_u16(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmlasq_m_n_u32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.]] = zext i16 [[P:%.]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.]] = call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> [[A:%.]], <4 x i32> [[B:%.]], i32 [[C:%.]], <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x i32> [[TMP2]]
				//
				uint32x4_t test_vmlasq_m_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c, mve_pred16_t p) {
				#ifdef POLYMORPHIC
				return vmlasq_m(a, b, c, p);
				#else /* POLYMORPHIC */
				return vmlasq_m_n_u32(a, b, c, p);
				#endif /* POLYMORPHIC */
				}

llvm/include/llvm/IR/IntrinsicsARM.td

	Show First 20 Lines • Show All 1,240 Lines • ▼ Show 20 Lines
	def int_arm_mve_vqmovn_predicated: Intrinsic<[llvm_anyvector_ty],			def int_arm_mve_vqmovn_predicated: Intrinsic<[llvm_anyvector_ty],
	[LLVMMatchType<0>, llvm_anyvector_ty,			[LLVMMatchType<0>, llvm_anyvector_ty,
	llvm_i32_ty /* unsigned output /, llvm_i32_ty / unsigned input */,			llvm_i32_ty /* unsigned output /, llvm_i32_ty / unsigned input */,
	llvm_i32_ty /* top half /, llvm_anyvector_ty / pred */], [IntrNoMem]>;			llvm_i32_ty /* top half /, llvm_anyvector_ty / pred */], [IntrNoMem]>;

	def int_arm_mve_fma_predicated: Intrinsic<[llvm_anyvector_ty],			def int_arm_mve_fma_predicated: Intrinsic<[llvm_anyvector_ty],
	[LLVMMatchType<0> /* mult op #1 /, LLVMMatchType<0> / mult op #2 */,			[LLVMMatchType<0> /* mult op #1 /, LLVMMatchType<0> / mult op #2 */,
	LLVMMatchType<0> /* addend /, llvm_anyvector_ty / pred */], [IntrNoMem]>;			LLVMMatchType<0> /* addend /, llvm_anyvector_ty / pred */], [IntrNoMem]>;
				def int_arm_mve_vmla_n_predicated: Intrinsic<[llvm_anyvector_ty],
				[LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* addend */,
				llvm_i32_ty /* mult op #2 (scalar) */, llvm_anyvector_ty /* pred */],
				[IntrNoMem]>;
				def int_arm_mve_vmlas_n_predicated: Intrinsic<[llvm_anyvector_ty],
				[LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* mult op #2 */,
				llvm_i32_ty /* addend (scalar) */, llvm_anyvector_ty /* pred */],
				[IntrNoMem]>;

	// CDE (Custom Datapath Extension)			// CDE (Custom Datapath Extension)

	def int_arm_cde_cx1: Intrinsic<			def int_arm_cde_cx1: Intrinsic<
	[llvm_i32_ty],			[llvm_i32_ty],
	[llvm_i32_ty /* coproc /, llvm_i32_ty / imm */],			[llvm_i32_ty /* coproc /, llvm_i32_ty / imm */],
	[IntrNoMem, ImmArg<0>, ImmArg<1>]>;			[IntrNoMem, ImmArg<0>, ImmArg<1>]>;

	} // end TargetPrefix			} // end TargetPrefix

llvm/lib/Target/ARM/ARMISelLowering.h

Show First 20 Lines • Show All 346 Lines • ▼ Show 20 Lines	EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;		MachineBasicBlock *MBB) const override;

void AdjustInstrPostInstrSelection(MachineInstr &MI,		void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;		SDNode *Node) const override;

SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;		SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;		SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;		SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
		SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;		SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;

bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;		bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;

/// allowsMisalignedMemoryAccesses - Returns true if the target allows		/// allowsMisalignedMemoryAccesses - Returns true if the target allows
/// unaligned memory accesses of the specified type. Returns whether it		/// unaligned memory accesses of the specified type. Returns whether it
/// is "fast" by reference in the second argument.		/// is "fast" by reference in the second argument.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,		bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
▲ Show 20 Lines • Show All 525 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 905 Lines • ▼ Show 20 Lines	if (Subtarget->hasNEON()) {
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);		setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

// NEON only has FMA instructions as of VFP4.		// NEON only has FMA instructions as of VFP4.
if (!Subtarget->hasVFP4Base()) {		if (!Subtarget->hasVFP4Base()) {
setOperationAction(ISD::FMA, MVT::v2f32, Expand);		setOperationAction(ISD::FMA, MVT::v2f32, Expand);
setOperationAction(ISD::FMA, MVT::v4f32, Expand);		setOperationAction(ISD::FMA, MVT::v4f32, Expand);
}		}

setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::SHL);		setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRL);		setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SRA);		setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::FP_TO_SINT);		setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);		setTargetDAGCombine(ISD::FP_TO_UINT);
setTargetDAGCombine(ISD::FDIV);		setTargetDAGCombine(ISD::FDIV);
setTargetDAGCombine(ISD::LOAD);		setTargetDAGCombine(ISD::LOAD);

Show All 11 Lines	ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {		if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
setTargetDAGCombine(ISD::BUILD_VECTOR);		setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);		setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);		setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::STORE);		setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::SIGN_EXTEND);		setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);		setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);		setTargetDAGCombine(ISD::ANY_EXTEND);
		setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);		setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INTRINSIC_VOID);		setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::VECREDUCE_ADD);		setTargetDAGCombine(ISD::VECREDUCE_ADD);
setTargetDAGCombine(ISD::ADD);		setTargetDAGCombine(ISD::ADD);
}		}

if (!Subtarget->hasFP64()) {		if (!Subtarget->hasFP64()) {
// When targeting a floating-point unit with only single-precision		// When targeting a floating-point unit with only single-precision
▲ Show 20 Lines • Show All 13,184 Lines • ▼ Show 20 Lines	static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))		if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))
return Create64bitNode(ARMISD::VMLALVs, {A, B});		return Create64bitNode(ARMISD::VMLALVs, {A, B});
if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))		if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))
return Create64bitNode(ARMISD::VMLALVu, {A, B});		return Create64bitNode(ARMISD::VMLALVu, {A, B});
return SDValue();		return SDValue();
}		}

/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.		/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {		SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
		DAGCombinerInfo &DCI) const {
		SelectionDAG &DAG = DCI.DAG;
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();		unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
switch (IntNo) {		switch (IntNo) {
default:		default:
// Don't do anything for most intrinsics.		// Don't do anything for most intrinsics.
break;		break;

// Vector shifts: check for immediate versions and lower them.		// Vector shifts: check for immediate versions and lower them.
// Note: This is done during DAG combining instead of DAG legalizing because		// Note: This is done during DAG combining instead of DAG legalizing because
▲ Show 20 Lines • Show All 128 Lines • ▼ Show 20 Lines	case Intrinsic::arm_neon_vshiftins: {
}		}

SDLoc dl(N);		SDLoc dl(N);
return DAG.getNode(VShiftOpc, dl, N->getValueType(0),		return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
N->getOperand(1), N->getOperand(2),		N->getOperand(1), N->getOperand(2),
DAG.getConstant(Cnt, dl, MVT::i32));		DAG.getConstant(Cnt, dl, MVT::i32));
}		}

case Intrinsic::arm_neon_vqrshifts:		case Intrinsic::arm_neon_vqrshifts:
case Intrinsic::arm_neon_vqrshiftu:		case Intrinsic::arm_neon_vqrshiftu:
// No immediate versions of these to check for.		// No immediate versions of these to check for.
break;		break;
		dmgreen [inline comment, Not Done]: I know it's not part of this patch, but what is this code for?
		simon_tatham (Author) [inline comment, Done]: I have no idea! They date from rG2e076c4e02fb99 in 2009, which first introduced NEON support, and even then, the switch they appear in had a `default: break` clause. My best guess is that they're intended as a kind of documentation. They signal to the reader: "I didn't forget about these intrinsics, I carefully considered them, decided they didn't need handling here, and included a comment saying why not."

		case Intrinsic::arm_mve_vmla_n_predicated:
		case Intrinsic::arm_mve_vmlas_n_predicated: {
		// These intrinsics all take an i32 scalar operand which is narrowed to the
		// size of a single lane of the vector type they return. So we don't need
		// any bits of that operand above that point, which allows us to eliminate
		// uxth/sxth.
		unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
		APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
		if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
		return SDValue();
		break;
		}
}		}

return SDValue();		return SDValue();
}		}

/// PerformShiftCombine - Checks for immediate versions of vector shifts and		/// PerformShiftCombine - Checks for immediate versions of vector shifts and
/// lowers them. As with the vector shift intrinsics, this is done during DAG		/// lowers them. As with the vector shift intrinsics, this is done during DAG
/// combining instead of DAG legalizing because the build_vectors for 64-bit		/// combining instead of DAG legalizing because the build_vectors for 64-bit
▲ Show 20 Lines • Show All 703 Lines • ▼ Show 20 Lines	SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);		case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);		case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);		case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
case ISD::FP_TO_SINT:		case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:		case ISD::FP_TO_UINT:
return PerformVCVTCombine(N, DCI.DAG, Subtarget);		return PerformVCVTCombine(N, DCI.DAG, Subtarget);
case ISD::FDIV:		case ISD::FDIV:
return PerformVDIVCombine(N, DCI.DAG, Subtarget);		return PerformVDIVCombine(N, DCI.DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);		case ISD::INTRINSIC_WO_CHAIN:
		return PerformIntrinsicCombine(N, DCI);
case ISD::SHL:		case ISD::SHL:
case ISD::SRA:		case ISD::SRA:
case ISD::SRL:		case ISD::SRL:
return PerformShiftCombine(N, DCI, Subtarget);		return PerformShiftCombine(N, DCI, Subtarget);
case ISD::SIGN_EXTEND:		case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:		case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);		case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);		case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
▲ Show 20 Lines • Show All 2,866 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMInstrMVE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,146 Lines • ▼ Show 20 Lines	class MVE_VFMAMLA_qr<string iname, string suffix,
let Inst{21-20} = bits_21_20;		let Inst{21-20} = bits_21_20;
let Inst{16} = 0b1;		let Inst{16} = 0b1;
let Inst{12} = S;		let Inst{12} = S;
let Inst{8} = 0b0;		let Inst{8} = 0b0;
let Inst{5} = 0b0;		let Inst{5} = 0b0;
let validForTailPredication = 1;		let validForTailPredication = 1;
}		}

def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>;		multiclass MVE_VMLA_qr_multi<string iname, MVEVectorVTInfo VTI,
def MVE_VMLA_qr_s16 : MVE_VFMAMLA_qr<"vmla", "s16", 0b0, 0b01, 0b0>;		bit scalar_addend> {
def MVE_VMLA_qr_s32 : MVE_VFMAMLA_qr<"vmla", "s32", 0b0, 0b10, 0b0>;		def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size,
def MVE_VMLA_qr_u8 : MVE_VFMAMLA_qr<"vmla", "u8", 0b1, 0b00, 0b0>;		scalar_addend>;
def MVE_VMLA_qr_u16 : MVE_VFMAMLA_qr<"vmla", "u16", 0b1, 0b01, 0b0>;		defvar Inst = !cast<Instruction>(NAME);
def MVE_VMLA_qr_u32 : MVE_VFMAMLA_qr<"vmla", "u32", 0b1, 0b10, 0b0>;		defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # iname # "_n_predicated");
		defvar v1 = (VTI.Vec MQPR:$v1);
def MVE_VMLAS_qr_s8 : MVE_VFMAMLA_qr<"vmlas", "s8", 0b0, 0b00, 0b1>;		defvar v2 = (VTI.Vec MQPR:$v2);
def MVE_VMLAS_qr_s16 : MVE_VFMAMLA_qr<"vmlas", "s16", 0b0, 0b01, 0b1>;		defvar vs = (VTI.Vec (ARMvdup rGPR:$s));
def MVE_VMLAS_qr_s32 : MVE_VFMAMLA_qr<"vmlas", "s32", 0b0, 0b10, 0b1>;		defvar s = (i32 rGPR:$s);
def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>;		defvar pred = (VTI.Pred VCCR:$pred);
def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>;
def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>;		// The signed and unsigned variants of this instruction have different
		// encodings, but they're functionally identical. For the sake of
let Predicates = [HasMVEInt] in {		// determinism, we generate only the unsigned variant.
def : Pat<(v4i32 (add (v4i32 MQPR:$src1),		if VTI.Unsigned then let Predicates = [HasMVEInt] in {
(v4i32 (mul (v4i32 MQPR:$src2),		if scalar_addend then {
(v4i32 (ARMvdup (i32 rGPR:$x))))))),		def : Pat<(VTI.Vec (add (mul v1, v2), vs)), (VTI.Vec (Inst v1, v2, s))>;
		dmgreen [inline comment, Done]: Can you make it so that the patterns are written as `def : Pat<(VTI.Vec (add (mul v1, v2), vs)), (VTI.Vec (Inst v1, v2, s))>;` so they are easier to read?
(v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>;		} else {
def : Pat<(v8i16 (add (v8i16 MQPR:$src1),		def : Pat<(VTI.Vec (add (mul v2, vs), v1)), (VTI.Vec (Inst v1, v2, s))>;
(v8i16 (mul (v8i16 MQPR:$src2),		}
(v8i16 (ARMvdup (i32 rGPR:$x))))))),
(v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>;		def : Pat<(VTI.Vec (pred_int v1, v2, s, pred)),
def : Pat<(v16i8 (add (v16i8 MQPR:$src1),		(VTI.Vec (Inst v1, v2, s, ARMVCCThen, pred))>;
(v16i8 (mul (v16i8 MQPR:$src2),
(v16i8 (ARMvdup (i32 rGPR:$x))))))),
(v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>;

def : Pat<(v4i32 (add (v4i32 (ARMvdup (i32 rGPR:$x))),
(v4i32 (mul (v4i32 MQPR:$src1),
(v4i32 MQPR:$src2))))),
(v4i32 (MVE_VMLAS_qr_u32 $src1, $src2, $x))>;
def : Pat<(v8i16 (add (v8i16 (ARMvdup (i32 rGPR:$x))),
(v8i16 (mul (v8i16 MQPR:$src1),
(v8i16 MQPR:$src2))))),
(v8i16 (MVE_VMLAS_qr_u16 $src1, $src2, $x))>;
def : Pat<(v16i8 (add (v16i8 (ARMvdup (i32 rGPR:$x))),
(v16i8 (mul (v16i8 MQPR:$src1),
(v16i8 MQPR:$src2))))),
(v16i8 (MVE_VMLAS_qr_u8 $src1, $src2, $x))>;
}		}
		}

		defm MVE_VMLA_qr_s8 : MVE_VMLA_qr_multi<"vmla", MVE_v16s8, 0b0>;
		defm MVE_VMLA_qr_s16 : MVE_VMLA_qr_multi<"vmla", MVE_v8s16, 0b0>;
		defm MVE_VMLA_qr_s32 : MVE_VMLA_qr_multi<"vmla", MVE_v4s32, 0b0>;
		defm MVE_VMLA_qr_u8 : MVE_VMLA_qr_multi<"vmla", MVE_v16u8, 0b0>;
		defm MVE_VMLA_qr_u16 : MVE_VMLA_qr_multi<"vmla", MVE_v8u16, 0b0>;
		defm MVE_VMLA_qr_u32 : MVE_VMLA_qr_multi<"vmla", MVE_v4u32, 0b0>;

		defm MVE_VMLAS_qr_s8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16s8, 0b1>;
		defm MVE_VMLAS_qr_s16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8s16, 0b1>;
		defm MVE_VMLAS_qr_s32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4s32, 0b1>;
		defm MVE_VMLAS_qr_u8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16u8, 0b1>;
		defm MVE_VMLAS_qr_u16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8u16, 0b1>;
		defm MVE_VMLAS_qr_u32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4u32, 0b1>;

multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,		multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
bit scalar_addend> {		bit scalar_addend> {
def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, scalar_addend>;		def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, scalar_addend>;
defvar Inst = !cast<Instruction>(NAME);		defvar Inst = !cast<Instruction>(NAME);
defvar pred_int = int_arm_mve_fma_predicated;		defvar pred_int = int_arm_mve_fma_predicated;
defvar v1 = (VTI.Vec MQPR:$v1);		defvar v1 = (VTI.Vec MQPR:$v1);
defvar v2 = (VTI.Vec MQPR:$v2);		defvar v2 = (VTI.Vec MQPR:$v2);
▲ Show 20 Lines • Show All 1,734 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll

	Show First 20 Lines • Show All 95 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: vfms.f32 q0, q1, q2			; CHECK-NEXT: vfms.f32 q0, q1, q2
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = fneg <4 x float> %c			%0 = fneg <4 x float> %c
	%1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %0, <4 x float> %a)			%1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %0, <4 x float> %a)
	ret <4 x float> %1			ret <4 x float> %1
	}			}

				define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
				; CHECK-LABEL: test_vmlaq_n_s8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmla.u8 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
				%.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
				%0 = mul <16 x i8> %.splat, %b
				%1 = add <16 x i8> %0, %a
				ret <16 x i8> %1
				}

				define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
				; CHECK-LABEL: test_vmlaq_n_s16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmla.u16 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
				%.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
				%0 = mul <8 x i16> %.splat, %b
				%1 = add <8 x i16> %0, %a
				ret <8 x i16> %1
				}

				define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
				; CHECK-LABEL: test_vmlaq_n_s32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmla.u32 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
				%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
				%0 = mul <4 x i32> %.splat, %b
				%1 = add <4 x i32> %0, %a
				ret <4 x i32> %1
				}

				define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c) {
				; CHECK-LABEL: test_vmlaq_n_u8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmla.u8 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
				%.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
				%0 = mul <16 x i8> %.splat, %b
				%1 = add <16 x i8> %0, %a
				ret <16 x i8> %1
				}

				define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) {
				; CHECK-LABEL: test_vmlaq_n_u16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmla.u16 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
				%.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
				%0 = mul <8 x i16> %.splat, %b
				%1 = add <8 x i16> %0, %a
				ret <8 x i16> %1
				}

				define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
				; CHECK-LABEL: test_vmlaq_n_u32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmla.u32 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
				%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
				%0 = mul <4 x i32> %.splat, %b
				%1 = add <4 x i32> %0, %a
				ret <4 x i32> %1
				}

				define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
				; CHECK-LABEL: test_vmlasq_n_s8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmlas.u8 q1, q0, r0
				; CHECK-NEXT: vmov q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = mul <16 x i8> %b, %a
				%.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
				%.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
				%1 = add <16 x i8> %.splat, %0
				ret <16 x i8> %1
				}

				define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
				; CHECK-LABEL: test_vmlasq_n_s16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmlas.u16 q1, q0, r0
				; CHECK-NEXT: vmov q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = mul <8 x i16> %b, %a
				%.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
				%.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
				%1 = add <8 x i16> %.splat, %0
				ret <8 x i16> %1
				}

				define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
				; CHECK-LABEL: test_vmlasq_n_s32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmlas.u32 q1, q0, r0
				; CHECK-NEXT: vmov q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = mul <4 x i32> %b, %a
				%.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
				%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
				%1 = add <4 x i32> %.splat, %0
				ret <4 x i32> %1
				}

				define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c) {
				; CHECK-LABEL: test_vmlasq_n_u8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmlas.u8 q1, q0, r0
				; CHECK-NEXT: vmov q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = mul <16 x i8> %b, %a
				%.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
				%.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
				%1 = add <16 x i8> %.splat, %0
				ret <16 x i8> %1
				}

				define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) {
				; CHECK-LABEL: test_vmlasq_n_u16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmlas.u16 q1, q0, r0
				; CHECK-NEXT: vmov q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = mul <8 x i16> %b, %a
				%.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
				%.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
				%1 = add <8 x i16> %.splat, %0
				ret <8 x i16> %1
				}

				define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
				; CHECK-LABEL: test_vmlasq_n_u32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmlas.u32 q1, q0, r0
				; CHECK-NEXT: vmov q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = mul <4 x i32> %b, %a
				%.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
				%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
				%1 = add <4 x i32> %.splat, %0
				ret <4 x i32> %1
				}

	define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i16 zeroext %p) {			define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i16 zeroext %p) {
	; CHECK-LABEL: test_vfmaq_m_f16:			; CHECK-LABEL: test_vfmaq_m_f16:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vmsr p0, r0			; CHECK-NEXT: vmsr p0, r0
	; CHECK-NEXT: vpst			; CHECK-NEXT: vpst
	; CHECK-NEXT: vfmat.f16 q0, q1, q2			; CHECK-NEXT: vfmat.f16 q0, q1, q2
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	▲ Show 20 Lines • Show All 116 Lines • ▼ Show 20 Lines
	entry:			entry:
	%0 = fneg <4 x float> %c			%0 = fneg <4 x float> %c
	%1 = zext i16 %p to i32			%1 = zext i16 %p to i32
	%2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)			%2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
	%3 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %0, <4 x float> %a, <4 x i1> %2)			%3 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %0, <4 x float> %a, <4 x i1> %2)
	ret <4 x float> %3			ret <4 x float> %3
	}			}

				define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlaq_m_n_s8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlat.u8 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i8 %c to i32
				%1 = zext i16 %p to i32
				%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
				%3 = tail call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
				ret <16 x i8> %3
				}

				define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlaq_m_n_s16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlat.u16 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %c to i32
				%1 = zext i16 %p to i32
				%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
				%3 = tail call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
				ret <8 x i16> %3
				}

				define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlaq_m_n_s32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlat.u32 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
				%2 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
				ret <4 x i32> %2
				}

				define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_m_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlaq_m_n_u8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlat.u8 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i8 %c to i32
				%1 = zext i16 %p to i32
				%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
				%3 = tail call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
				ret <16 x i8> %3
				}

				define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_m_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlaq_m_n_u16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlat.u16 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %c to i32
				%1 = zext i16 %p to i32
				%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
				%3 = tail call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
				ret <8 x i16> %3
				}

				define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_m_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlaq_m_n_u32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlat.u32 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
				%2 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
				ret <4 x i32> %2
				}

				define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlasq_m_n_s8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlast.u8 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i8 %c to i32
				%1 = zext i16 %p to i32
				%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
				%3 = tail call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
				ret <16 x i8> %3
				}

				define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlasq_m_n_s16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlast.u16 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %c to i32
				%1 = zext i16 %p to i32
				%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
				%3 = tail call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
				ret <8 x i16> %3
				}

				define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlasq_m_n_s32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlast.u32 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
				%2 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
				ret <4 x i32> %2
				}

				define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_m_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlasq_m_n_u8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlast.u8 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i8 %c to i32
				%1 = zext i16 %p to i32
				%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
				%3 = tail call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
				ret <16 x i8> %3
				}

				define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_m_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlasq_m_n_u16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlast.u16 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %c to i32
				%1 = zext i16 %p to i32
				%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
				%3 = tail call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
				ret <8 x i16> %3
				}

				define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_m_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
				; CHECK-LABEL: test_vmlasq_m_n_u32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r1
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmlast.u32 q0, q1, r0
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
				%2 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
				ret <4 x i32> %2
				}

				declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
	declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)			declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
	declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)			declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)

	declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)			declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
	declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)			declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
	declare <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x half>, <8 x i1>)			declare <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x half>, <8 x i1>)
	declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)			declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)
				declare <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
				declare <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
				declare <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
				declare <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
				declare <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
				declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)