This is an archive of the discontinued LLVM Phabricator instance.

clang/include/clang/Basic/arm_mve.td
289	I wonder if we should implement at least the simple case (integer and unpredicated) using standard IR nodes instead of an IR intrinsic? We already implement `vmaxq` using an icmp and a select. We haven't implemented `vabsq` yet, but when we do, it will surely be done in a similar way, to take advantage of the existing pattern matching showcased in `llvm/test/CodeGen/Thumb2/mve-abs.ll`. So possibly we should code-generate `vmaxaq(a,b)` as if it was `vmaxq(a, vabsq(b))`, and write a more complicated isel pattern that will match that whole tree? The advantage would be that if a user had literally written a combination of `vmaxq` and `vabsq`, codegen would be able to fold them together into a single instruction at compile time. The FP versions might make sense to do the same way, using the standard `@llvm.fabs` IR intrinsic for the abs part.

Much better implementation after review feedback.

MarkMurrayARM marked an inline comment as done. Jan 15 2020, 8:18 AM

Harbormaster completed remote builds in B44052: Diff 238269. Jan 15 2020, 8:21 AM

LGTM this time. Thanks for the rewrite!

This revision is now accepted and ready to land. Jan 15 2020, 8:39 AM

Closed by commit rGda9d57d2c2dc: [ARM][MVE][Intrinsics] Add VMINAQ, VMINNMAQ, VMAXAQ, VMAXNMAQ intrinsics. (authored by MarkMurrayARM). · Explain Why · Jan 15 2020, 9:25 AM

This revision was automatically updated to reflect the committed changes.

Nice one. Good to see codegen changes coming out of these intrinsics.

It took a while for me to figure out what the integer instruction was doing. That's a strange one.

The fp case I have a question about below.

llvm/lib/Target/ARM/ARMInstrMVE.td
3658	If I'm reading the ARMARM correctly, the fp case seems to perform the abs on both operands.

MarkMurrayARM marked an inline comment as done. Jan 16 2020, 1:50 AM

MarkMurrayARM added inline comments.

llvm/lib/Target/ARM/ARMInstrMVE.td
3658	My bad. Fix coming under separate cover.

Revision Contents

Path

Size

clang/

include/

clang/

Basic/

arm_mve.td

26 lines

test/

CodeGen/

arm-mve-intrinsics/

107 lines

67 lines

107 lines

67 lines

llvm/

include/

llvm/

IR/

IntrinsicsARM.td

12 lines

lib/

Target/

ARM/

ARMInstrMVE.td

69 lines

test/

CodeGen/

Thumb2/

mve-intrinsics/

98 lines

68 lines

98 lines

68 lines

Diff 238295

clang/include/clang/Basic/arm_mve.td

Show First 20 Lines • Show All 186 Lines • ▼ Show 20 Lines	let params = T.Int in {
defm vqsubq : VectorVectorArithmetic<"qsub_predicated", (? (unsignedflag Scalar)), 0>;		defm vqsubq : VectorVectorArithmetic<"qsub_predicated", (? (unsignedflag Scalar)), 0>;
defm vhsubq : VectorVectorArithmetic<"hsub_predicated", (? (unsignedflag Scalar))>;		defm vhsubq : VectorVectorArithmetic<"hsub_predicated", (? (unsignedflag Scalar))>;
defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 0))>;		defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 0))>;
defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 1))>;		defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 1))>;
}		}
let params = T.Signed in {		let params = T.Signed in {
defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated", (?), 0>;		defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated", (?), 0>;
defm vqrdmulhq : VectorVectorArithmetic<"qrdmulh_predicated", (?), 0>;		defm vqrdmulhq : VectorVectorArithmetic<"qrdmulh_predicated", (?), 0>;
		def vminaq_m: Intrinsic<UVector, (args UVector:$a, Vector:$b, Predicate:$pred),
		(IRInt<"vmina_predicated", [UVector,Predicate]> $a, $b, $pred)>;
		def vmaxaq_m: Intrinsic<UVector, (args UVector:$a, Vector:$b, Predicate:$pred),
		(IRInt<"vmaxa_predicated", [UVector,Predicate]> $a, $b, $pred)>;
}		}

let params = T.Poly, overrideKindLetter = "p" in {		let params = T.Poly, overrideKindLetter = "p" in {
defm vmullbq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (? (u32 0))>;		defm vmullbq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (? (u32 0))>;
defm vmulltq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (? (u32 1))>;		defm vmulltq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (? (u32 1))>;
}		}

// Predicated intrinsics - Float types only		// Predicated intrinsics - Float types only
let params = T.Float in {		let params = T.Float in {
defm vminnmq : VectorVectorArithmetic<"min_predicated", (? (u32 0))>;		defm vminnmq : VectorVectorArithmetic<"min_predicated", (? (u32 0))>;
defm vmaxnmq : VectorVectorArithmetic<"max_predicated", (? (u32 0))>;		defm vmaxnmq : VectorVectorArithmetic<"max_predicated", (? (u32 0))>;
		def vminnmaq_m: Intrinsic<Vector, (args Vector:$a, Vector:$b, Predicate:$pred),
		(IRInt<"vminnma_predicated", [Vector,Predicate]> $a, $b, $pred)>;
		def vmaxnmaq_m: Intrinsic<Vector, (args Vector:$a, Vector:$b, Predicate:$pred),
		(IRInt<"vmaxnma_predicated", [Vector,Predicate]> $a, $b, $pred)>;
}		}

let params = T.Int in {		let params = T.Int in {
def vminvq: Intrinsic<Scalar, (args Scalar:$prev, Vector:$vec),		def vminvq: Intrinsic<Scalar, (args Scalar:$prev, Vector:$vec),
(Scalar (IRInt<"minv", [Vector], 1> $prev, $vec))>;		(Scalar (IRInt<"minv", [Vector], 1> $prev, $vec))>;
def vmaxvq: Intrinsic<Scalar, (args Scalar:$prev, Vector:$vec),		def vmaxvq: Intrinsic<Scalar, (args Scalar:$prev, Vector:$vec),
(Scalar (IRInt<"maxv", [Vector], 1> $prev, $vec))>;		(Scalar (IRInt<"maxv", [Vector], 1> $prev, $vec))>;
}		}
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	let params = T.Float in {
defm: compare<"le", fcmp_le>;		defm: compare<"le", fcmp_le>;
}		}

let params = T.Signed in {		let params = T.Signed in {
def vminq: Intrinsic<Vector, (args Vector:$a, Vector:$b),		def vminq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
(select (icmp_sle $a, $b), $a, $b)>;		(select (icmp_sle $a, $b), $a, $b)>;
def vmaxq: Intrinsic<Vector, (args Vector:$a, Vector:$b),		def vmaxq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
(select (icmp_sge $a, $b), $a, $b)>;		(select (icmp_sge $a, $b), $a, $b)>;
		def vminaq: Intrinsic<UVector, (args UVector:$a, Vector:$b),
		(seq (select (icmp_slt $b, (zeroinit Vector)),
		(sub (zeroinit Vector), $b), $b):$absb,
		(select (icmp_ule $a, $absb), $a, $absb))>;
		simon_tathamUnsubmitted Done Reply Inline Actions I wonder if we should implement at least the simple case (integer and unpredicated) using standard IR nodes instead of an IR intrinsic? We already implement `vmaxq` using an icmp and a select. We haven't implemented `vabsq` yet, but when we do, it will surely be done in a similar way, to take advantage of the existing pattern matching showcased in `llvm/test/CodeGen/Thumb2/mve-abs.ll`. So possibly we should code-generate `vmaxaq(a,b)` as if it was `vmaxq(a, vabsq(b))`, and write a more complicated isel pattern that will match that whole tree? The advantage would be that if a user had literally written a combination of `vmaxq` and `vabsq`, codegen would be able to fold them together into a single instruction at compile time. The FP versions might make sense to do the same way, using the standard `@llvm.fabs` IR intrinsic for the abs part. simon_tatham: I wonder if we should implement at least the simple case (integer and unpredicated) using…
		def vmaxaq: Intrinsic<UVector, (args UVector:$a, Vector:$b),
		(seq (select (icmp_slt $b, (zeroinit Vector)),
		(sub (zeroinit Vector), $b), $b):$absb,
		(select (icmp_uge $a, $absb), $a, $absb))>;
}		}
let params = T.Unsigned in {		let params = T.Unsigned in {
def vminqu: Intrinsic<Vector, (args Vector:$a, Vector:$b),		def vminqu: Intrinsic<Vector, (args Vector:$a, Vector:$b),
(select (icmp_ule $a, $b), $a, $b)>,		(select (icmp_ule $a, $b), $a, $b)>,
NameOverride<"vminq">;		NameOverride<"vminq">;
def vmaxqu: Intrinsic<Vector, (args Vector:$a, Vector:$b),		def vmaxqu: Intrinsic<Vector, (args Vector:$a, Vector:$b),
(select (icmp_uge $a, $b), $a, $b)>,		(select (icmp_uge $a, $b), $a, $b)>,
NameOverride<"vmaxq">;		NameOverride<"vmaxq">;
}		}
let params = T.Float in {		let params = T.Float in {
def vminnmq: Intrinsic<Vector, (args Vector:$a, Vector:$b),		def vminnmq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
(IRIntBase<"minnum", [Vector]> $a, $b)>;		(IRIntBase<"minnum", [Vector]> $a, $b)>;
def vmaxnmq: Intrinsic<Vector, (args Vector:$a, Vector:$b),		def vmaxnmq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
(IRIntBase<"maxnum", [Vector]> $a, $b)>;		(IRIntBase<"maxnum", [Vector]> $a, $b)>;
		def vminnmaq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
		(IRIntBase<"minnum", [Vector]>
		$a, (IRIntBase<"fabs", [Vector]> $b))>;
		def vmaxnmaq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
		(IRIntBase<"maxnum", [Vector]>
		$a, (IRIntBase<"fabs", [Vector]> $b))>;
}		}

def vpselq: Intrinsic<Vector, (args Vector:$t, Vector:$f, Predicate:$pred),		def vpselq: Intrinsic<Vector, (args Vector:$t, Vector:$f, Predicate:$pred),
(select $pred, $t, $f)> { let params = T.Usual; }		(select $pred, $t, $f)> { let params = T.Usual; }
def vpselq_64: Intrinsic<		def vpselq_64: Intrinsic<
Vector, (args Vector:$t, Vector:$f, PredOf<u32>:$pred),		Vector, (args Vector:$t, Vector:$f, PredOf<u32>:$pred),
(bitcast (select $pred, (bitcast $t, VecOf<u32>),		(bitcast (select $pred, (bitcast $t, VecOf<u32>),
(bitcast $f, VecOf<u32>)), Vector)>,		(bitcast $f, VecOf<u32>)), Vector)>,
▲ Show 20 Lines • Show All 705 Lines • Show Last 20 Lines

clang/test/CodeGen/arm-mve-intrinsics/vmaxaq.c

This file was added.

				// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
				// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
				// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

				#include <arm_mve.h>

				// CHECK-LABEL: @test_vmaxaq_s8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <16 x i8> [[B:%.*]], zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = sub <16 x i8> zeroinitializer, [[B]]
				// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[B]]
				// CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <16 x i8> [[TMP2]], [[A:%.*]]
				// CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i8> [[TMP2]], <16 x i8> [[A]]
				// CHECK-NEXT: ret <16 x i8> [[TMP4]]
				//
				uint8x16_t test_vmaxaq_s8(uint8x16_t a, int8x16_t b)
				{
				#ifdef POLYMORPHIC
				return vmaxaq(a, b);
				#else /* POLYMORPHIC */
				return vmaxaq_s8(a, b);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmaxaq_s16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <8 x i16> [[B:%.*]], zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = sub <8 x i16> zeroinitializer, [[B]]
				// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[B]]
				// CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <8 x i16> [[TMP2]], [[A:%.*]]
				// CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[TMP2]], <8 x i16> [[A]]
				// CHECK-NEXT: ret <8 x i16> [[TMP4]]
				//
				uint16x8_t test_vmaxaq_s16(uint16x8_t a, int16x8_t b)
				{
				#ifdef POLYMORPHIC
				return vmaxaq(a, b);
				#else /* POLYMORPHIC */
				return vmaxaq_s16(a, b);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmaxaq_s32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[B:%.*]], zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> zeroinitializer, [[B]]
				// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[B]]
				// CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i32> [[TMP2]], [[A:%.*]]
				// CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP2]], <4 x i32> [[A]]
				// CHECK-NEXT: ret <4 x i32> [[TMP4]]
				//
				uint32x4_t test_vmaxaq_s32(uint32x4_t a, int32x4_t b)
				{
				#ifdef POLYMORPHIC
				return vmaxaq(a, b);
				#else /* POLYMORPHIC */
				return vmaxaq_s32(a, b);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmaxaq_m_s8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.vmaxa.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
				// CHECK-NEXT: ret <16 x i8> [[TMP2]]
				//
				uint8x16_t test_vmaxaq_m_s8(uint8x16_t a, int8x16_t b, mve_pred16_t p)
				{
				#ifdef POLYMORPHIC
				return vmaxaq_m(a, b, p);
				#else /* POLYMORPHIC */
				return vmaxaq_m_s8(a, b, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmaxaq_m_s16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.vmaxa.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
				// CHECK-NEXT: ret <8 x i16> [[TMP2]]
				//
				uint16x8_t test_vmaxaq_m_s16(uint16x8_t a, int16x8_t b, mve_pred16_t p)
				{
				#ifdef POLYMORPHIC
				return vmaxaq_m(a, b, p);
				#else /* POLYMORPHIC */
				return vmaxaq_m_s16(a, b, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmaxaq_m_s32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.vmaxa.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x i32> [[TMP2]]
				//
				uint32x4_t test_vmaxaq_m_s32(uint32x4_t a, int32x4_t b, mve_pred16_t p)
				{
				#ifdef POLYMORPHIC
				return vmaxaq_m(a, b, p);
				#else /* POLYMORPHIC */
				return vmaxaq_m_s32(a, b, p);
				#endif /* POLYMORPHIC */
				}

clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c

This file was added.

				// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
				// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
				// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

				#include <arm_mve.h>

				// CHECK-LABEL: @test_vmaxnmaq_f16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[B:%.*]])
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.maxnum.v8f16(<8 x half> [[A:%.*]], <8 x half> [[TMP0]])
				// CHECK-NEXT: ret <8 x half> [[TMP1]]
				//
				float16x8_t test_vmaxnmaq_f16(float16x8_t a, float16x8_t b)
				{
				#ifdef POLYMORPHIC
				return vmaxnmaq(a, b);
				#else /* POLYMORPHIC */
				return vmaxnmaq_f16(a, b);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmaxnmaq_f32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[B:%.*]])
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[A:%.*]], <4 x float> [[TMP0]])
				// CHECK-NEXT: ret <4 x float> [[TMP1]]
				//
				float32x4_t test_vmaxnmaq_f32(float32x4_t a, float32x4_t b)
				{
				#ifdef POLYMORPHIC
				return vmaxnmaq(a, b);
				#else /* POLYMORPHIC */
				return vmaxnmaq_f32(a, b);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmaxnmaq_m_f16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]])
				// CHECK-NEXT: ret <8 x half> [[TMP2]]
				//
				float16x8_t test_vmaxnmaq_m_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
				{
				#ifdef POLYMORPHIC
				return vmaxnmaq_m(a, b, p);
				#else /* POLYMORPHIC */
				return vmaxnmaq_m_f16(a, b, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vmaxnmaq_m_f32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x float> [[TMP2]]
				//
				float32x4_t test_vmaxnmaq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
				{
				#ifdef POLYMORPHIC
				return vmaxnmaq_m(a, b, p);
				#else /* POLYMORPHIC */
				return vmaxnmaq_m_f32(a, b, p);
				#endif /* POLYMORPHIC */
				}

clang/test/CodeGen/arm-mve-intrinsics/vminaq.c

This file was added.

				// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
				// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
				// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

				#include <arm_mve.h>

				// CHECK-LABEL: @test_vminaq_s8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <16 x i8> [[B:%.*]], zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = sub <16 x i8> zeroinitializer, [[B]]
				// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[B]]
				// CHECK-NEXT: [[TMP3:%.*]] = icmp ult <16 x i8> [[TMP2]], [[A:%.*]]
				// CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i8> [[TMP2]], <16 x i8> [[A]]
				// CHECK-NEXT: ret <16 x i8> [[TMP4]]
				//
				uint8x16_t test_vminaq_s8(uint8x16_t a, int8x16_t b)
				{
				#ifdef POLYMORPHIC
				return vminaq(a, b);
				#else /* POLYMORPHIC */
				return vminaq_s8(a, b);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vminaq_s16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <8 x i16> [[B:%.*]], zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = sub <8 x i16> zeroinitializer, [[B]]
				// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[B]]
				// CHECK-NEXT: [[TMP3:%.*]] = icmp ult <8 x i16> [[TMP2]], [[A:%.*]]
				// CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[TMP2]], <8 x i16> [[A]]
				// CHECK-NEXT: ret <8 x i16> [[TMP4]]
				//
				uint16x8_t test_vminaq_s16(uint16x8_t a, int16x8_t b)
				{
				#ifdef POLYMORPHIC
				return vminaq(a, b);
				#else /* POLYMORPHIC */
				return vminaq_s16(a, b);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vminaq_s32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[B:%.*]], zeroinitializer
				// CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> zeroinitializer, [[B]]
				// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[B]]
				// CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP2]], [[A:%.*]]
				// CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP2]], <4 x i32> [[A]]
				// CHECK-NEXT: ret <4 x i32> [[TMP4]]
				//
				uint32x4_t test_vminaq_s32(uint32x4_t a, int32x4_t b)
				{
				#ifdef POLYMORPHIC
				return vminaq(a, b);
				#else /* POLYMORPHIC */
				return vminaq_s32(a, b);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vminaq_m_s8(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.vmina.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
				// CHECK-NEXT: ret <16 x i8> [[TMP2]]
				//
				uint8x16_t test_vminaq_m_s8(uint8x16_t a, int8x16_t b, mve_pred16_t p)
				{
				#ifdef POLYMORPHIC
				return vminaq_m(a, b, p);
				#else /* POLYMORPHIC */
				return vminaq_m_s8(a, b, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vminaq_m_s16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.vmina.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
				// CHECK-NEXT: ret <8 x i16> [[TMP2]]
				//
				uint16x8_t test_vminaq_m_s16(uint16x8_t a, int16x8_t b, mve_pred16_t p)
				{
				#ifdef POLYMORPHIC
				return vminaq_m(a, b, p);
				#else /* POLYMORPHIC */
				return vminaq_m_s16(a, b, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vminaq_m_s32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.vmina.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x i32> [[TMP2]]
				//
				uint32x4_t test_vminaq_m_s32(uint32x4_t a, int32x4_t b, mve_pred16_t p)
				{
				#ifdef POLYMORPHIC
				return vminaq_m(a, b, p);
				#else /* POLYMORPHIC */
				return vminaq_m_s32(a, b, p);
				#endif /* POLYMORPHIC */
				}

clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c

This file was added.

				// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
				// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
				// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

				#include <arm_mve.h>

				// CHECK-LABEL: @test_vminnmaq_f16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[B:%.*]])
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.minnum.v8f16(<8 x half> [[A:%.*]], <8 x half> [[TMP0]])
				// CHECK-NEXT: ret <8 x half> [[TMP1]]
				//
				float16x8_t test_vminnmaq_f16(float16x8_t a, float16x8_t b)
				{
				#ifdef POLYMORPHIC
				return vminnmaq(a, b);
				#else /* POLYMORPHIC */
				return vminnmaq_f16(a, b);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vminnmaq_f32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[B:%.*]])
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> [[A:%.*]], <4 x float> [[TMP0]])
				// CHECK-NEXT: ret <4 x float> [[TMP1]]
				//
				float32x4_t test_vminnmaq_f32(float32x4_t a, float32x4_t b)
				{
				#ifdef POLYMORPHIC
				return vminnmaq(a, b);
				#else /* POLYMORPHIC */
				return vminnmaq_f32(a, b);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vminnmaq_m_f16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]])
				// CHECK-NEXT: ret <8 x half> [[TMP2]]
				//
				float16x8_t test_vminnmaq_m_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
				{
				#ifdef POLYMORPHIC
				return vminnmaq_m(a, b, p);
				#else /* POLYMORPHIC */
				return vminnmaq_m_f16(a, b, p);
				#endif /* POLYMORPHIC */
				}

				// CHECK-LABEL: @test_vminnmaq_m_f32(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x float> [[TMP2]]
				//
				float32x4_t test_vminnmaq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
				{
				#ifdef POLYMORPHIC
				return vminnmaq_m(a, b, p);
				#else /* POLYMORPHIC */
				return vminnmaq_m_f32(a, b, p);
				#endif /* POLYMORPHIC */
				}

llvm/include/llvm/IR/IntrinsicsARM.td

Show First 20 Lines • Show All 870 Lines • ▼ Show 20 Lines	def int_arm_mve_rhadd_predicated: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,		[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;		llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
def int_arm_mve_qsub_predicated: Intrinsic<[llvm_anyvector_ty],		def int_arm_mve_qsub_predicated: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,		[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;		llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
def int_arm_mve_hsub_predicated: Intrinsic<[llvm_anyvector_ty],		def int_arm_mve_hsub_predicated: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,		[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;		llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
		def int_arm_mve_vmina_predicated: Intrinsic<[llvm_anyvector_ty],
		[LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty],
		[IntrNoMem]>;
		def int_arm_mve_vmaxa_predicated: Intrinsic<[llvm_anyvector_ty],
		[LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty],
		[IntrNoMem]>;
		def int_arm_mve_vminnma_predicated: Intrinsic<[llvm_anyvector_ty],
		[LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty],
		[IntrNoMem]>;
		def int_arm_mve_vmaxnma_predicated: Intrinsic<[llvm_anyvector_ty],
		[LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty],
		[IntrNoMem]>;

defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty],		defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty],
[llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;		[llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty],		defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty],
[llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;		[llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;

multiclass MVEPredicated<list<LLVMType> rets, list<LLVMType> params,		multiclass MVEPredicated<list<LLVMType> rets, list<LLVMType> params,
LLVMType pred = llvm_anyvector_ty,		LLVMType pred = llvm_anyvector_ty,
▲ Show 20 Lines • Show All 238 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMInstrMVE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 2,285 Lines • ▼ Show 20 Lines	class MVE_VMINMAXA<string iname, string suffix, bits<2> size,
let Inst{11-6} = 0b111010;		let Inst{11-6} = 0b111010;
let Inst{5} = Qm{3};		let Inst{5} = Qm{3};
let Inst{4} = 0b0;		let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};		let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;		let Inst{0} = 0b1;
let validForTailPredication = 1;		let validForTailPredication = 1;
}		}

def MVE_VMAXAs8 : MVE_VMINMAXA<"vmaxa", "s8", 0b00, 0b0>;		multiclass MVE_VMINMAXA_m<string iname, MVEVectorVTInfo VTI,
def MVE_VMAXAs16 : MVE_VMINMAXA<"vmaxa", "s16", 0b01, 0b0>;		SDNode unpred_op, Intrinsic pred_int, bit bit_12> {
def MVE_VMAXAs32 : MVE_VMINMAXA<"vmaxa", "s32", 0b10, 0b0>;		def "" : MVE_VMINMAXA<iname, VTI.Suffix, VTI.Size, bit_12>;
		defvar Inst = !cast<Instruction>(NAME);
def MVE_VMINAs8 : MVE_VMINMAXA<"vmina", "s8", 0b00, 0b1>;
def MVE_VMINAs16 : MVE_VMINMAXA<"vmina", "s16", 0b01, 0b1>;		let Predicates = [HasMVEInt] in {
def MVE_VMINAs32 : MVE_VMINMAXA<"vmina", "s32", 0b10, 0b1>;		// Unpredicated v(min|max)a
		def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qd), (abs (VTI.Vec MQPR:$Qm)))),
		(VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm)))>;

		// Predicated v(min|max)a
		def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
		(VTI.Pred VCCR:$mask))),
		(VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
		ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
		}
		}

		multiclass MVE_VMINA<MVEVectorVTInfo VTI>
		: MVE_VMINMAXA_m<"vmina", VTI, umin, int_arm_mve_vmina_predicated, 0b1>;

		defm MVE_VMAXAs8 : MVE_VMINA<MVE_v16s8>;
		defm MVE_VMAXAs16 : MVE_VMINA<MVE_v8s16>;
		defm MVE_VMAXAs32 : MVE_VMINA<MVE_v4s32>;

		multiclass MVE_VMAXA<MVEVectorVTInfo VTI>
		: MVE_VMINMAXA_m<"vmaxa", VTI, umax, int_arm_mve_vmaxa_predicated, 0b0>;

		defm MVE_VMINAs8 : MVE_VMAXA<MVE_v16s8>;
		defm MVE_VMINAs16 : MVE_VMAXA<MVE_v8s16>;
		defm MVE_VMINAs32 : MVE_VMAXA<MVE_v4s32>;

// end of MVE Integer instructions		// end of MVE Integer instructions

// start of mve_imm_shift instructions		// start of mve_imm_shift instructions

def MVE_VSHLC : MVE_p<(outs rGPR:$RdmDest, MQPR:$Qd),		def MVE_VSHLC : MVE_p<(outs rGPR:$RdmDest, MQPR:$Qd),
(ins MQPR:$QdSrc, rGPR:$RdmSrc, long_shift:$imm),		(ins MQPR:$QdSrc, rGPR:$RdmSrc, long_shift:$imm),
NoItinerary, "vshlc", "", "$QdSrc, $RdmSrc, $imm",		NoItinerary, "vshlc", "", "$QdSrc, $RdmSrc, $imm",
▲ Show 20 Lines • Show All 1,309 Lines • ▼ Show 20 Lines	class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12,
let Inst{12} = bit_12;		let Inst{12} = bit_12;
let Inst{11-6} = 0b111010;		let Inst{11-6} = 0b111010;
let Inst{5} = Qm{3};		let Inst{5} = Qm{3};
let Inst{4} = 0b0;		let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};		let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;		let Inst{0} = 0b1;
}		}

def MVE_VMAXNMAf32 : MVE_VMAXMINNMA<"vmaxnma", "f32", 0b0, 0b0>;		multiclass MVE_VMAXMINNMA_m<string iname, MVEVectorVTInfo VTI,
def MVE_VMAXNMAf16 : MVE_VMAXMINNMA<"vmaxnma", "f16", 0b1, 0b0>;		SDNode unpred_op, Intrinsic pred_int,
		bit bit_12> {
		def "" : MVE_VMAXMINNMA<iname, VTI.Suffix, VTI.Size{0}, bit_12>;
		defvar Inst = !cast<Instruction>(NAME);

		let Predicates = [HasMVEInt] in {
		// Unpredicated v(max|min)nma
		def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qd), (fabs (VTI.Vec MQPR:$Qm)))),
		dmgreenUnsubmitted Not Done Reply Inline Actions If I'm reading the ARMARM correctly, the fp case seems to perform the abs on both operands. dmgreen: If I'm reading the ARMARM correctly, the fp case seems to perform the abs on both operands.
		MarkMurrayARMAuthorUnsubmitted Done Reply Inline Actions My bad. Fix coming under separate cover. MarkMurrayARM: My bad. Fix coming under separate cover.
		(VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm)))>;

		// Predicated v(max|min)nma
		def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
		(VTI.Pred VCCR:$mask))),
		(VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
		ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
		}
		}

		multiclass MVE_VMAXNMA<MVEVectorVTInfo VTI, bit bit_12>
		: MVE_VMAXMINNMA_m<"vmaxnma", VTI, fmaxnum, int_arm_mve_vmaxnma_predicated, bit_12>;

		defm MVE_VMAXNMAf32 : MVE_VMAXNMA<MVE_v4f32, 0b0>;
		defm MVE_VMAXNMAf16 : MVE_VMAXNMA<MVE_v8f16, 0b0>;

		multiclass MVE_VMINNMA<MVEVectorVTInfo VTI, bit bit_12>
		: MVE_VMAXMINNMA_m<"vminnma", VTI, fminnum, int_arm_mve_vminnma_predicated, bit_12>;

def MVE_VMINNMAf32 : MVE_VMAXMINNMA<"vminnma", "f32", 0b0, 0b1>;		defm MVE_VMINNMAf32 : MVE_VMINNMA<MVE_v4f32, 0b1>;
def MVE_VMINNMAf16 : MVE_VMAXMINNMA<"vminnma", "f16", 0b1, 0b1>;		defm MVE_VMINNMAf16 : MVE_VMINNMA<MVE_v8f16, 0b1>;

// end of MVE Floating Point instructions		// end of MVE Floating Point instructions

// start of MVE compares		// start of MVE compares

class MVE_VCMPqq<string suffix, bit bit_28, bits<2> bits_21_20,		class MVE_VCMPqq<string suffix, bit bit_28, bits<2> bits_21_20,
VCMPPredicateOperand predtype, list<dag> pattern=[]>		VCMPPredicateOperand predtype, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, MQPR:$Qm, predtype:$fc),		: MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, MQPR:$Qm, predtype:$fc),
▲ Show 20 Lines • Show All 2,752 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxaq.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s

				define arm_aapcs_vfpcc <16 x i8> @test_vmaxaq_s8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vmaxaq_s8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmaxa.s8 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = icmp slt <16 x i8> %b, zeroinitializer
				%1 = sub <16 x i8> zeroinitializer, %b
				%2 = select <16 x i1> %0, <16 x i8> %1, <16 x i8> %b
				%3 = icmp ugt <16 x i8> %2, %a
				%4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %a
				ret <16 x i8> %4
				}

				define arm_aapcs_vfpcc <8 x i16> @test_vmaxaq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vmaxaq_s16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmaxa.s16 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = icmp slt <8 x i16> %b, zeroinitializer
				%1 = sub <8 x i16> zeroinitializer, %b
				%2 = select <8 x i1> %0, <8 x i16> %1, <8 x i16> %b
				%3 = icmp ugt <8 x i16> %2, %a
				%4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %a
				ret <8 x i16> %4
				}

				define arm_aapcs_vfpcc <4 x i32> @test_vmaxaq_s32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vmaxaq_s32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmaxa.s32 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = icmp slt <4 x i32> %b, zeroinitializer
				%1 = sub <4 x i32> zeroinitializer, %b
				%2 = select <4 x i1> %0, <4 x i32> %1, <4 x i32> %b
				%3 = icmp ugt <4 x i32> %2, %a
				%4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> %a
				ret <4 x i32> %4
				}

				define arm_aapcs_vfpcc <16 x i8> @test_vmaxaq_m_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
				; CHECK-LABEL: test_vmaxaq_m_s8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmaxat.s8 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
				%2 = tail call <16 x i8> @llvm.arm.mve.vmaxa.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
				ret <16 x i8> %2
				}

				declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2

				declare <16 x i8> @llvm.arm.mve.vmaxa.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>) #2

				define arm_aapcs_vfpcc <8 x i16> @test_vmaxaq_m_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
				; CHECK-LABEL: test_vmaxaq_m_s16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmaxat.s16 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
				%2 = tail call <8 x i16> @llvm.arm.mve.vmaxa.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
				ret <8 x i16> %2
				}

				declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2

				declare <8 x i16> @llvm.arm.mve.vmaxa.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>) #2

				define arm_aapcs_vfpcc <4 x i32> @test_vmaxaq_m_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
				; CHECK-LABEL: test_vmaxaq_m_s32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmaxat.s32 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
				%2 = tail call <4 x i32> @llvm.arm.mve.vmaxa.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
				ret <4 x i32> %2
				}

				declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2

				declare <4 x i32> @llvm.arm.mve.vmaxa.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>) #2

llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmaq.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s

				define arm_aapcs_vfpcc <8 x half> @test_vmaxnmaq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vmaxnmaq_f16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmaxnma.f16 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
				%1 = tail call <8 x half> @llvm.maxnum.v8f16(<8 x half> %a, <8 x half> %0)
				ret <8 x half> %1
				}

				declare <8 x half> @llvm.fabs.v8f16(<8 x half>) #1

				declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>) #1

				define arm_aapcs_vfpcc <4 x float> @test_vmaxnmaq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vmaxnmaq_f32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmaxnma.f32 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
				%1 = tail call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %0)
				ret <4 x float> %1
				}

				declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #1

				declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1

				define arm_aapcs_vfpcc <8 x half> @test_vmaxnmaq_m_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vmaxnmaq_m_f16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmaxnmat.f16 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
				%2 = tail call <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1)
				ret <8 x half> %2
				}

				declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2

				declare <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>) #2

				define arm_aapcs_vfpcc <4 x float> @test_vmaxnmaq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vmaxnmaq_m_f32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vmaxnmat.f32 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
				%2 = tail call <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1)
				ret <4 x float> %2
				}

				declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2

				declare <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>) #2

llvm/test/CodeGen/Thumb2/mve-intrinsics/vminaq.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s

				define arm_aapcs_vfpcc <16 x i8> @test_vminaq_s8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vminaq_s8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmina.s8 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = icmp slt <16 x i8> %b, zeroinitializer
				%1 = sub <16 x i8> zeroinitializer, %b
				%2 = select <16 x i1> %0, <16 x i8> %1, <16 x i8> %b
				%3 = icmp ult <16 x i8> %2, %a
				%4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %a
				ret <16 x i8> %4
				}

				define arm_aapcs_vfpcc <8 x i16> @test_vminaq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vminaq_s16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmina.s16 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = icmp slt <8 x i16> %b, zeroinitializer
				%1 = sub <8 x i16> zeroinitializer, %b
				%2 = select <8 x i1> %0, <8 x i16> %1, <8 x i16> %b
				%3 = icmp ult <8 x i16> %2, %a
				%4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %a
				ret <8 x i16> %4
				}

				define arm_aapcs_vfpcc <4 x i32> @test_vminaq_s32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vminaq_s32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmina.s32 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = icmp slt <4 x i32> %b, zeroinitializer
				%1 = sub <4 x i32> zeroinitializer, %b
				%2 = select <4 x i1> %0, <4 x i32> %1, <4 x i32> %b
				%3 = icmp ult <4 x i32> %2, %a
				%4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> %a
				ret <4 x i32> %4
				}

				define arm_aapcs_vfpcc <16 x i8> @test_vminaq_m_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
				; CHECK-LABEL: test_vminaq_m_s8:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vminat.s8 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
				%2 = tail call <16 x i8> @llvm.arm.mve.vmina.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
				ret <16 x i8> %2
				}

				declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2

				declare <16 x i8> @llvm.arm.mve.vmina.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>) #2

				define arm_aapcs_vfpcc <8 x i16> @test_vminaq_m_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
				; CHECK-LABEL: test_vminaq_m_s16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vminat.s16 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
				%2 = tail call <8 x i16> @llvm.arm.mve.vmina.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
				ret <8 x i16> %2
				}

				declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2

				declare <8 x i16> @llvm.arm.mve.vmina.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>) #2

				define arm_aapcs_vfpcc <4 x i32> @test_vminaq_m_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
				; CHECK-LABEL: test_vminaq_m_s32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vminat.s32 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
				%2 = tail call <4 x i32> @llvm.arm.mve.vmina.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
				ret <4 x i32> %2
				}

				declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2

				declare <4 x i32> @llvm.arm.mve.vmina.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>) #2

llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmaq.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s

				define arm_aapcs_vfpcc <8 x half> @test_vminnmaq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vminnmaq_f16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vminnma.f16 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
				%1 = tail call <8 x half> @llvm.minnum.v8f16(<8 x half> %a, <8 x half> %0)
				ret <8 x half> %1
				}

				declare <8 x half> @llvm.fabs.v8f16(<8 x half>) #1

				declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>) #1

				define arm_aapcs_vfpcc <4 x float> @test_vminnmaq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vminnmaq_f32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vminnma.f32 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
				%1 = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %0)
				ret <4 x float> %1
				}

				declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #1

				declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1

				define arm_aapcs_vfpcc <8 x half> @test_vminnmaq_m_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vminnmaq_m_f16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vminnmat.f16 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
				%2 = tail call <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1)
				ret <8 x half> %2
				}

				declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2

				declare <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>) #2

				define arm_aapcs_vfpcc <4 x float> @test_vminnmaq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
				; CHECK-LABEL: test_vminnmaq_m_f32:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vminnmat.f32 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
				%2 = tail call <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1)
				ret <4 x float> %2
				}

				declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2

				declare <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>) #2