diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -596,6 +596,118 @@ (xval $pair, 0))>; } +multiclass MVEBinaryVectorHoriz32 { + def xsuffix#"q" + : Intrinsic + (unsignedflag Scalar), subtract, exchange, + (zeroinit Scalar32), $a, $b)>; + def xsuffix#"q_p" + : Intrinsic + (unsignedflag Scalar), subtract, exchange, + (zeroinit Scalar32), $a, $b, $pred)>; + + def "a"#xsuffix#"q" + : Intrinsic + (unsignedflag Scalar), subtract, exchange, + $a, $b, $c)>; + def "a"#xsuffix#"q_p" + : Intrinsic + (unsignedflag Scalar), subtract, exchange, + $a, $b, $c, $pred)>; +} + +class IntrSplit64 + : Intrinsic; + +class IntrSplit64ZeroInit + : Intrinsic; + +multiclass MVEBinaryVectorHoriz64Base { + def xsuffix#"q" + : IntrSplit64ZeroInit + (unsignedflag Scalar), subtract, exchange, + $lo, $hi, $a, $b)>; + def xsuffix#"q_p" + : IntrSplit64ZeroInit + (unsignedflag Scalar), subtract, exchange, + $lo, $hi, $a, $b, $pred)>; + + def "a"#xsuffix#"q" + : IntrSplit64 + (unsignedflag Scalar), subtract, exchange, + $lo, $hi, $b, $c)>; + def "a"#xsuffix#"q_p" + : IntrSplit64 + (unsignedflag Scalar), subtract, exchange, + $lo, $hi, $b, $c, $pred)>; +} + +multiclass MVEBinaryVectorHoriz64 { + defm "" : MVEBinaryVectorHoriz64Base; +} + +multiclass MVEBinaryVectorHoriz64R { + defm "" : MVEBinaryVectorHoriz64Base; +} + +let params = T.Int in { +def vabavq : Intrinsic (unsignedflag Scalar), $a, $b, $c)>; +def vabavq_p : Intrinsic + (unsignedflag Scalar), $a, $b, $c, $pred)>; + +defm vmladav : MVEBinaryVectorHoriz32; +} + +let params = T.Signed in { +defm vmladav : MVEBinaryVectorHoriz32; +defm vmlsdav : MVEBinaryVectorHoriz32; +defm vmlsdav : MVEBinaryVectorHoriz32; +} + +let params = [u16, s16, u32, s32] in +defm vmlaldav : MVEBinaryVectorHoriz64; + +let params = [s16, s32] in { +defm vmlaldav : MVEBinaryVectorHoriz64; +defm vmlsldav : MVEBinaryVectorHoriz64; +defm vmlsldav : MVEBinaryVectorHoriz64; +} + +let params = T.Int32 in +defm vrmlaldavh : MVEBinaryVectorHoriz64R; + +let params = [s32] in { +defm vrmlaldavh : MVEBinaryVectorHoriz64R; +defm vrmlsldavh : MVEBinaryVectorHoriz64R; +defm vrmlsldavh : MVEBinaryVectorHoriz64R; +} + foreach desttype = T.All in { // We want a vreinterpretq between every pair of supported vector types // _except_ that there shouldn't be one from a type to itself. diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td --- a/clang/include/clang/Basic/arm_mve_defs.td +++ b/clang/include/clang/Basic/arm_mve_defs.td @@ -270,6 +270,11 @@ def UScalar: Unsigned; def UVector: VecOf; +// Expands to the 32-bit integer of the same signedness as Scalar. +def Scalar32: CopyKind; +// Expands to the 64-bit integer of the same signedness as Scalar. +def Scalar64: CopyKind; + // ----------------------------------------------------------------------------- // Internal definitions for specifying immediate arguments for an intrinsic. @@ -435,3 +440,13 @@ list All64 = Int64; list All = Usual # All64; } + +// ----------------------------------------------------------------------------- +// Container record for DAG constant values. These constants are used because +// bit/int class/multiclass parameters cannot be used to produce a dag node: +// for example (u32 x) where x is 0 is transformed into (u32 { 0 }) by the +// Tablegen parser. 
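+//
+// As an illustrative sketch (assuming the multiclass parameters named above,
+// e.g. 'subtract' and 'exchange', are declared with dag type), an
+// instantiation can then pass these records directly:
+//
+//   defm vmladav : MVEBinaryVectorHoriz32<V.False, V.False, "">;
+//   defm vmlsdav : MVEBinaryVectorHoriz32<V.True,  V.False, "">;
+//
+// so that V.False / V.True are substituted verbatim wherever the IRInt call
+// expects a (u32 0) or (u32 1) flag operand.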
+def V { + dag False = (u32 0); + dag True = (u32 1); +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c b/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c @@ -0,0 +1,173 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s + +#include + +// CHECK-LABEL: @test_vabavq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v16i8(i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vabavq_s8(uint32_t a, int8x16_t b, int8x16_t c) { +#ifdef POLYMORPHIC + return vabavq(a, b, c); +#else + return vabavq_s8(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vabavq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v8i16(i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vabavq_s16(uint32_t a, int16x8_t b, int16x8_t c) { +#ifdef POLYMORPHIC + return vabavq(a, b, c); +#else + return vabavq_s16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vabavq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v4i32(i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vabavq_s32(uint32_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vabavq(a, b, c); +#else + return vabavq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vabavq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v16i8(i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vabavq_u8(uint32_t a, uint8x16_t b, uint8x16_t c) { +#ifdef POLYMORPHIC + return vabavq(a, b, c); +#else + return vabavq_u8(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vabavq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v8i16(i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vabavq_u16(uint32_t a, uint16x8_t b, uint16x8_t c) { +#ifdef POLYMORPHIC + return vabavq(a, b, c); +#else + return vabavq_u16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vabavq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v4i32(i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vabavq_u32(uint32_t a, uint32x4_t b, uint32x4_t c) { +#ifdef POLYMORPHIC + return vabavq(a, b, c); +#else + return vabavq_u32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vabavq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// 
+uint32_t test_vabavq_p_s8(uint32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vabavq_p(a, b, c, p); +#else + return vabavq_p_s8(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vabavq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vabavq_p_s16(uint32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vabavq_p(a, b, c, p); +#else + return vabavq_p_s16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vabavq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vabavq_p_s32(uint32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vabavq_p(a, b, c, p); +#else + return vabavq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vabavq_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vabavq_p_u8(uint32_t a, uint8x16_t b, uint8x16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vabavq_p(a, b, c, p); +#else + return vabavq_p_u8(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vabavq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vabavq_p_u16(uint32_t a, uint16x8_t b, uint16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vabavq_p(a, b, c, p); +#else + return vabavq_p_u16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vabavq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vabavq_p_u32(uint32_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vabavq_p(a, b, c, p); +#else + return vabavq_p_u32(a, b, c, p); +#endif +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c b/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c @@ -0,0 +1,845 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp 
-mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s + +#include + +// CHECK-LABEL: @test_vmladavaq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmladavaq_s8(int32_t a, int8x16_t b, int8x16_t c) { +#ifdef POLYMORPHIC + return vmladavaq(a, b, c); +#else + return vmladavaq_s8(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmladavaq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmladavaq_s16(int32_t a, int16x8_t b, int16x8_t c) { +#ifdef POLYMORPHIC + return vmladavaq(a, b, c); +#else + return vmladavaq_s16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmladavaq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmladavaq_s32(int32_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vmladavaq(a, b, c); +#else + return vmladavaq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmladavaq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vmladavaq_u8(uint32_t a, uint8x16_t b, uint8x16_t c) { +#ifdef POLYMORPHIC + return vmladavaq(a, b, c); +#else + return vmladavaq_u8(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmladavaq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vmladavaq_u16(uint32_t a, uint16x8_t b, uint16x8_t c) { +#ifdef POLYMORPHIC + return vmladavaq(a, b, c); +#else + return vmladavaq_u16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmladavaq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vmladavaq_u32(uint32_t a, uint32x4_t b, uint32x4_t c) { +#ifdef POLYMORPHIC + return vmladavaq(a, b, c); +#else + return vmladavaq_u32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmladavaxq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmladavaxq_s8(int32_t a, int8x16_t b, int8x16_t c) { +#ifdef POLYMORPHIC + return vmladavaxq(a, b, c); +#else + return vmladavaxq_s8(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmladavaxq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t 
test_vmladavaxq_s16(int32_t a, int16x8_t b, int16x8_t c) { +#ifdef POLYMORPHIC + return vmladavaxq(a, b, c); +#else + return vmladavaxq_s16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmladavaxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmladavaxq_s32(int32_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vmladavaxq(a, b, c); +#else + return vmladavaxq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavaq_s8(int32_t a, int8x16_t b, int8x16_t c) { +#ifdef POLYMORPHIC + return vmlsdavaq(a, b, c); +#else + return vmlsdavaq_s8(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavaq_s16(int32_t a, int16x8_t b, int16x8_t c) { +#ifdef POLYMORPHIC + return vmlsdavaq(a, b, c); +#else + return vmlsdavaq_s16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavaq_s32(int32_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vmlsdavaq(a, b, c); +#else + return vmlsdavaq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaxq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavaxq_s8(int32_t a, int8x16_t b, int8x16_t c) { +#ifdef POLYMORPHIC + return vmlsdavaxq(a, b, c); +#else + return vmlsdavaxq_s8(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaxq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavaxq_s16(int32_t a, int16x8_t b, int16x8_t c) { +#ifdef POLYMORPHIC + return vmlsdavaxq(a, b, c); +#else + return vmlsdavaxq_s16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavaxq_s32(int32_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vmlsdavaxq(a, b, c); +#else + return vmlsdavaxq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmladavaq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavaq_p_s8(int32_t a, int8x16_t b, int8x16_t 
c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavaq_p(a, b, c, p); +#else + return vmladavaq_p_s8(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmladavaq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavaq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavaq_p(a, b, c, p); +#else + return vmladavaq_p_s16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmladavaq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavaq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavaq_p(a, b, c, p); +#else + return vmladavaq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmladavaq_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vmladavaq_p_u8(uint32_t a, uint8x16_t b, uint8x16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavaq_p(a, b, c, p); +#else + return vmladavaq_p_u8(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmladavaq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vmladavaq_p_u16(uint32_t a, uint16x8_t b, uint16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavaq_p(a, b, c, p); +#else + return vmladavaq_p_u16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmladavaq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vmladavaq_p_u32(uint32_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavaq_p(a, b, c, p); +#else + return vmladavaq_p_u32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmladavaxq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 
0, i32 0, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavaxq_p_s8(int32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavaxq_p(a, b, c, p); +#else + return vmladavaxq_p_s8(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmladavaxq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavaxq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavaxq_p(a, b, c, p); +#else + return vmladavaxq_p_s16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmladavaxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavaxq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavaxq_p(a, b, c, p); +#else + return vmladavaxq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavaq_p_s8(int32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavaq_p(a, b, c, p); +#else + return vmlsdavaq_p_s8(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavaq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavaq_p(a, b, c, p); +#else + return vmlsdavaq_p_s16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavaq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavaq_p(a, b, c, p); +#else + return vmlsdavaq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaxq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 
[[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavaxq_p_s8(int32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavaxq_p(a, b, c, p); +#else + return vmlsdavaxq_p_s8(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaxq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavaxq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavaxq_p(a, b, c, p); +#else + return vmlsdavaxq_p_s16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavaxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavaxq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavaxq_p(a, b, c, p); +#else + return vmlsdavaxq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmladavq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmladavq_s8(int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vmladavq(a, b); +#else + return vmladavq_s8(a, b); +#endif +} + +// CHECK-LABEL: @test_vmladavq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmladavq_s16(int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vmladavq(a, b); +#else + return vmladavq_s16(a, b); +#endif +} + +// CHECK-LABEL: @test_vmladavq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmladavq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vmladavq(a, b); +#else + return vmladavq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vmladavq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vmladavq_u8(uint8x16_t a, uint8x16_t b) { +#ifdef POLYMORPHIC + return vmladavq(a, b); +#else + return vmladavq_u8(a, b); +#endif +} + +// CHECK-LABEL: @test_vmladavq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t 
test_vmladavq_u16(uint16x8_t a, uint16x8_t b) { +#ifdef POLYMORPHIC + return vmladavq(a, b); +#else + return vmladavq_u16(a, b); +#endif +} + +// CHECK-LABEL: @test_vmladavq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vmladavq_u32(uint32x4_t a, uint32x4_t b) { +#ifdef POLYMORPHIC + return vmladavq(a, b); +#else + return vmladavq_u32(a, b); +#endif +} + +// CHECK-LABEL: @test_vmladavxq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmladavxq_s8(int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vmladavxq(a, b); +#else + return vmladavxq_s8(a, b); +#endif +} + +// CHECK-LABEL: @test_vmladavxq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmladavxq_s16(int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vmladavxq(a, b); +#else + return vmladavxq_s16(a, b); +#endif +} + +// CHECK-LABEL: @test_vmladavxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmladavxq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vmladavxq(a, b); +#else + return vmladavxq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlsdavq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavq_s8(int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vmlsdavq(a, b); +#else + return vmlsdavq_s8(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlsdavq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavq_s16(int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vmlsdavq(a, b); +#else + return vmlsdavq_s16(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlsdavq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vmlsdavq(a, b); +#else + return vmlsdavq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlsdavxq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavxq_s8(int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vmlsdavxq(a, b); +#else + return vmlsdavxq_s8(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlsdavxq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavxq_s16(int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return 
vmlsdavxq(a, b); +#else + return vmlsdavxq_s16(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlsdavxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmlsdavxq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vmlsdavxq(a, b); +#else + return vmlsdavxq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vmladavq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavq_p(a, b, p); +#else + return vmladavq_p_s8(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmladavq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavq_p(a, b, p); +#else + return vmladavq_p_s16(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmladavq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavq_p(a, b, p); +#else + return vmladavq_p_s32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmladavq_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vmladavq_p_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavq_p(a, b, p); +#else + return vmladavq_p_u8(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmladavq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vmladavq_p_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavq_p(a, b, p); +#else + return vmladavq_p_u16(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmladavq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// 
CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vmladavq_p_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavq_p(a, b, p); +#else + return vmladavq_p_u32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmladavxq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavxq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavxq_p(a, b, p); +#else + return vmladavxq_p_s8(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmladavxq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavxq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavxq_p(a, b, p); +#else + return vmladavxq_p_s16(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmladavxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmladavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmladavxq_p(a, b, p); +#else + return vmladavxq_p_s32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavq_p(a, b, p); +#else + return vmlsdavq_p_s8(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavq_p(a, b, p); +#else + return vmlsdavq_p_s16(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 
[[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavq_p(a, b, p); +#else + return vmlsdavq_p_s32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavxq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavxq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavxq_p(a, b, p); +#else + return vmlsdavxq_p_s8(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavxq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavxq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavxq_p(a, b, p); +#else + return vmlsdavxq_p_s16(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlsdavxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmlsdavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsdavxq_p(a, b, p); +#else + return vmlsdavxq_p_s32(a, b, p); +#endif +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c b/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c @@ -0,0 +1,1295 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s + +#include + +// CHECK-LABEL: @test_vmlaldavaq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 
[[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vmlaldavaq_s16(int64_t a, int16x8_t b, int16x8_t c) { +#ifdef POLYMORPHIC + return vmlaldavaq(a, b, c); +#else + return vmlaldavaq_s16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlaldavaq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vmlaldavaq_s32(int64_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vmlaldavaq(a, b, c); +#else + return vmlaldavaq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlaldavaq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +uint64_t test_vmlaldavaq_u16(uint64_t a, uint16x8_t b, uint16x8_t c) { +#ifdef POLYMORPHIC + return vmlaldavaq(a, b, c); +#else + return vmlaldavaq_u16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlaldavaq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +uint64_t test_vmlaldavaq_u32(uint64_t a, uint32x4_t b, uint32x4_t c) { +#ifdef POLYMORPHIC + return vmlaldavaq(a, b, c); +#else + return vmlaldavaq_u32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlaldavaxq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: 
[[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vmlaldavaxq_s16(int64_t a, int16x8_t b, int16x8_t c) { +#ifdef POLYMORPHIC + return vmlaldavaxq(a, b, c); +#else + return vmlaldavaxq_s16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlaldavaxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vmlaldavaxq_s32(int64_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vmlaldavaxq(a, b, c); +#else + return vmlaldavaxq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlsldavaq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vmlsldavaq_s16(int64_t a, int16x8_t b, int16x8_t c) { +#ifdef POLYMORPHIC + return vmlsldavaq(a, b, c); +#else + return vmlsldavaq_s16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlsldavaq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vmlsldavaq_s32(int64_t a, int32x4_t b, 
int32x4_t c) { +#ifdef POLYMORPHIC + return vmlsldavaq(a, b, c); +#else + return vmlsldavaq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlsldaxvaq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vmlsldaxvaq_s16(int64_t a, int16x8_t b, int16x8_t c) { +#ifdef POLYMORPHIC + return vmlsldavaxq(a, b, c); +#else + return vmlsldavaxq_s16(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlsldavaxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vmlsldavaxq_s32(int64_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vmlsldavaxq(a, b, c); +#else + return vmlsldavaxq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhaq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vrmlaldavhaq_s32(int64_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vrmlaldavhaq(a, b, c); +#else + return vrmlaldavhaq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhaq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +uint64_t test_vrmlaldavhaq_u32(uint64_t a, uint32x4_t b, uint32x4_t c) { +#ifdef POLYMORPHIC + return vrmlaldavhaq(a, b, c); +#else + return vrmlaldavhaq_u32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhaxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vrmlaldavhaxq_s32(int64_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vrmlaldavhaxq(a, b, c); +#else + return vrmlaldavhaxq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vrmlsldavhaq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vrmlsldavhaq_s32(int64_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vrmlsldavhaq(a, b, c); +#else + return vrmlsldavhaq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vrmlsldavhaxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +int64_t test_vrmlsldavhaxq_s32(int64_t a, int32x4_t b, int32x4_t c) { +#ifdef POLYMORPHIC + return vrmlsldavhaxq(a, b, c); +#else + return vrmlsldavhaxq_s32(a, b, c); +#endif +} + +// CHECK-LABEL: @test_vmlaldavaq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to 
i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vmlaldavaq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaldavaq_p(a, b, c, p); +#else + return vmlaldavaq_p_s16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlaldavaq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vmlaldavaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaldavaq_p(a, b, c, p); +#else + return vmlaldavaq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlaldavaq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +uint64_t test_vmlaldavaq_p_u16(uint64_t a, uint16x8_t b, uint16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaldavaq_p(a, b, c, p); +#else + return vmlaldavaq_p_u16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlaldavaq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: 
[[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +uint64_t test_vmlaldavaq_p_u32(uint64_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaldavaq_p(a, b, c, p); +#else + return vmlaldavaq_p_u32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlaldavaxq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vmlaldavaxq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaldavaxq_p(a, b, c, p); +#else + return vmlaldavaxq_p_s16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlaldavaxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vmlaldavaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaldavaxq_p(a, b, c, p); +#else + return vmlaldavaxq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlsldavaq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: 
[[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vmlsldavaq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsldavaq_p(a, b, c, p); +#else + return vmlsldavaq_p_s16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlsldavaq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vmlsldavaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsldavaq_p(a, b, c, p); +#else + return vmlsldavaq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlsldaxvaq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vmlsldaxvaq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsldavaxq_p(a, b, c, p); +#else + return vmlsldavaxq_p_s16(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlsldavaxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = 
trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vmlsldavaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsldavaxq_p(a, b, c, p); +#else + return vmlsldavaxq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhaq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vrmlaldavhaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vrmlaldavhaq_p(a, b, c, p); +#else + return vrmlaldavhaq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhaq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +uint64_t test_vrmlaldavhaq_p_u32(uint64_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vrmlaldavhaq_p(a, b, c, p); +#else + return vrmlaldavhaq_p_u32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhaxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: 
[[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vrmlaldavhaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vrmlaldavhaxq_p(a, b, c, p); +#else + return vrmlaldavhaxq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vrmlsldavhaq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vrmlsldavhaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vrmlsldavhaq_p(a, b, c, p); +#else + return vrmlsldavhaq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vrmlsldavhaxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]] +// CHECK-NEXT: ret i64 [[TMP11]] +// +int64_t test_vrmlsldavhaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vrmlsldavhaxq_p(a, b, c, p); +#else + return vrmlsldavhaxq_p_s32(a, b, c, p); +#endif +} + +// CHECK-LABEL: @test_vmlaldavq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x 
i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vmlaldavq_s16(int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vmlaldavq(a, b); +#else + return vmlaldavq_s16(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlaldavq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vmlaldavq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vmlaldavq(a, b); +#else + return vmlaldavq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlaldavq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +uint64_t test_vmlaldavq_u16(uint16x8_t a, uint16x8_t b) { +#ifdef POLYMORPHIC + return vmlaldavq(a, b); +#else + return vmlaldavq_u16(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlaldavq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +uint64_t test_vmlaldavq_u32(uint32x4_t a, uint32x4_t b) { +#ifdef POLYMORPHIC + return vmlaldavq(a, b); +#else + return vmlaldavq_u32(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlaldavxq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vmlaldavxq_s16(int16x8_t a, int16x8_t b) { 
+#ifdef POLYMORPHIC + return vmlaldavxq(a, b); +#else + return vmlaldavxq_s16(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlaldavxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vmlaldavxq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vmlaldavxq(a, b); +#else + return vmlaldavxq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlsldavq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vmlsldavq_s16(int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vmlsldavq(a, b); +#else + return vmlsldavq_s16(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlsldavq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vmlsldavq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vmlsldavq(a, b); +#else + return vmlsldavq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlsldavxvq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vmlsldavxvq_s16(int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vmlsldavxq(a, b); +#else + return vmlsldavxq_s16(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlsldavxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 
32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vmlsldavxq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vmlsldavxq(a, b); +#else + return vmlsldavxq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vrmlaldavhq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vrmlaldavhq(a, b); +#else + return vrmlaldavhq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +uint64_t test_vrmlaldavhq_u32(uint32x4_t a, uint32x4_t b) { +#ifdef POLYMORPHIC + return vrmlaldavhq(a, b); +#else + return vrmlaldavhq_u32(a, b); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vrmlaldavhxq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vrmlaldavhxq(a, b); +#else + return vrmlaldavhxq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vrmlsldavhq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vrmlsldavhq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vrmlsldavhq(a, b); +#else + return vrmlsldavhq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vrmlsldavhxq_s32( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +int64_t test_vrmlsldavhxq_s32(int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vrmlsldavhxq(a, b); +#else + return vrmlsldavhxq_s32(a, b); +#endif +} + +// CHECK-LABEL: @test_vmlaldavq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vmlaldavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaldavq_p(a, b, p); +#else + return vmlaldavq_p_s16(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlaldavq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vmlaldavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaldavq_p(a, b, p); +#else + return vmlaldavq_p_s32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlaldavq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +uint64_t test_vmlaldavq_p_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return 
vmlaldavq_p(a, b, p); +#else + return vmlaldavq_p_u16(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlaldavq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +uint64_t test_vmlaldavq_p_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaldavq_p(a, b, p); +#else + return vmlaldavq_p_u32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlaldavxq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vmlaldavxq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaldavxq_p(a, b, p); +#else + return vmlaldavxq_p_s16(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlaldavxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vmlaldavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaldavxq_p(a, b, p); +#else + return vmlaldavxq_p_s32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlsldavq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: 
[[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vmlsldavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsldavq_p(a, b, p); +#else + return vmlsldavq_p_s16(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlsldavq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vmlsldavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsldavq_p(a, b, p); +#else + return vmlsldavq_p_s32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlsldaxvq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vmlsldaxvq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsldavxq_p(a, b, p); +#else + return vmlsldavxq_p_s16(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vmlsldavxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vmlsldavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlsldavxq_p(a, b, p); +#else + return vmlsldavxq_p_s32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> 
@llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vrmlaldavhq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vrmlaldavhq_p(a, b, p); +#else + return vrmlaldavhq_p_s32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +uint64_t test_vrmlaldavhq_p_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vrmlaldavhq_p(a, b, p); +#else + return vrmlaldavhq_p_u32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vrmlaldavhxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vrmlaldavhxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vrmlaldavhxq_p(a, b, p); +#else + return vrmlaldavhxq_p_s32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vrmlsldavhq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// 
CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vrmlsldavhq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vrmlsldavhq_p(a, b, p); +#else + return vrmlsldavhq_p_s32(a, b, p); +#endif +} + +// CHECK-LABEL: @test_vrmlsldavhxq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]] +// CHECK-NEXT: ret i64 [[TMP8]] +// +int64_t test_vrmlsldavhxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vrmlsldavhxq_p(a, b, p); +#else + return vrmlsldavhxq_p_s32(a, b, p); +#endif +} diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -931,4 +931,51 @@ def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; +// MVE vector absolute difference and accumulate across vector +// The first operand is an 'unsigned' flag. The remaining operands are: +// * accumulator +// * first vector operand +// * second vector operand +// * mask (only in predicated versions) +defm int_arm_mve_vabav: MVEPredicated< + [llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], llvm_anyvector_ty, + [IntrNoMem]>; + +// The following 3 intrinsics are MVE vector reductions with two vector +// operands. 
+// The first 3 operands are boolean flags (must be compile-time constants): +// * unsigned - the instruction operates on vectors of unsigned values and +// unsigned scalars +// * subtract - the instruction performs subtraction after multiplication of +// lane pairs (e.g., vmlsdav vs vmladav) +// * exchange - the instruction exchanges successive even and odd lanes of +// the first operands before multiplication of lane pairs +// (e.g., vmladavx vs vmladav) +// The remaining operands are: +// * accumulator +// * first vector operand +// * second vector operand +// * mask (only in predicated versions) + +// Version with 32-bit result, vml{a,s}dav[a][x] +defm int_arm_mve_vmldava: MVEPredicated< + [llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], + llvm_anyvector_ty, [IntrNoMem]>; + +// Version with 64-bit result, vml{a,s}ldav[a][x] +defm int_arm_mve_vmlldava: MVEPredicated< + [llvm_i32_ty, llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], + llvm_anyvector_ty, [IntrNoMem]>; + +// Version with 72-bit rounded result, vrml{a,s}ldavh[a][x] +defm int_arm_mve_vrmlldavha: MVEPredicated< + [llvm_i32_ty, llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], + llvm_anyvector_ty, [IntrNoMem]>; } // end TargetPrefix diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -233,6 +233,42 @@ void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry, uint16_t OpcodeWithNoCarry, bool Add, bool Predicated); + + /// Select MVE "vector absolute difference and accumulate across vector" + /// intrinsics (arm_mve_vabav, arm_mve_vabav_predicated). + /// OpcodesS contains opcodes for the signed instructions (s8, s16, s32), + /// and OpcodesU for the unsigned ones. + void SelectMVE_VABAV(SDNode *N, bool Predicated, const uint16_t *OpcodesS, + const uint16_t *OpcodesU); + + /// Select MVE vector reductions with two vector operands + /// Long means that the instruction operates on a long accumulator (64-bit + /// or 72-bit with rounding), i.e. uses two GPRs for the scalar operand. + /// Stride is the number of vector element widths the instruction can operate + /// on: + /// 3 for short variants, vml{a,s}dav[a][x]: [i8, i16, i32] + /// 2 for long non-rounding variants, vml{a,s}ldav[a][x]: [i16, i32] + /// 1 for long rounding variants: vrml{a,s}ldavh[a][x]: [i32] + /// Stride is used when addressing the OpcodesS array which contains multiple + /// opcodes for each element width. 
+ /// TySize is the index into the list of element types listed above + void SelectBaseMVE_VMLDAV(SDNode *N, bool Predicated, bool Long, + const uint16_t *OpcodesS, const uint16_t *OpcodesU, + size_t Stride, size_t TySize); + + /// Select a 32-bit MVE vector reduction with two vector operands, + /// arm_mve_vmldava[_predicated] + void SelectMVE_VMLDAV(SDNode *N, bool Predicated, const uint16_t *OpcodesS, + const uint16_t *OpcodesU); + /// Select a 64-bit MVE vector reduction with two vector operands + /// arm_mve_vmlldava_[predicated] + void SelectMVE_VMLLDAV(SDNode *N, bool Predicated, const uint16_t *OpcodesS, + const uint16_t *OpcodesU); + /// Select a 72-bit MVE vector rounding reduction with two vector operands + /// int_arm_mve_vrmlldavha[_predicated] + void SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated, const uint16_t *OpcodesS, + const uint16_t *OpcodesU); + /// SelectMVE_VLD - Select MVE interleaving load intrinsics. NumVecs /// should be 2 or 4. The opcode array specifies the instructions /// used for 8, 16 and 32-bit lane sizes respectively, and each @@ -2517,6 +2553,163 @@ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); } +static bool SDValueToConstBool(SDValue SDVal) { + assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant"); + ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal); + uint64_t Value = SDValConstant->getZExtValue(); + assert((Value == 0 || Value == 1) && "expected value 0 or 1"); + return Value; +} + +void ARMDAGToDAGISel::SelectMVE_VABAV(SDNode *N, bool Predicated, + const uint16_t *OpcodesS, + const uint16_t *OpcodesU) { + bool IsUnsigned = SDValueToConstBool(N->getOperand(1)); + const uint16_t *Opcodes = IsUnsigned ? OpcodesU : OpcodesS; + uint16_t Opcode; + EVT VecTy = N->getOperand(3).getValueType(); + switch (VecTy.getVectorElementType().getSizeInBits()) { + case 8: + Opcode = Opcodes[0]; + break; + case 16: + Opcode = Opcodes[1]; + break; + case 32: + Opcode = Opcodes[2]; + break; + default: + llvm_unreachable("bad vector element size in SelectMVE_VABAV"); + } + + SDLoc Loc(N); + SmallVector<SDValue, 8> Ops; + for (int i = 2; i < 5; ++i) + Ops.push_back(N->getOperand(i)); + + if (Predicated) + AddMVEPredicateToOps(Ops, Loc, N->getOperand(5)); + else + AddEmptyMVEPredicateToOps(Ops, Loc); + + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); +} + +void ARMDAGToDAGISel::SelectBaseMVE_VMLDAV(SDNode *N, bool Predicated, + bool Long, + const uint16_t *OpcodesS, + const uint16_t *OpcodesU, + size_t Stride, + size_t TySize) { + assert(TySize < Stride && "Invalid TySize"); + bool IsUnsigned = SDValueToConstBool(N->getOperand(1)); + bool IsSub = SDValueToConstBool(N->getOperand(2)); + bool IsExchange = SDValueToConstBool(N->getOperand(3)); + if (IsUnsigned) { + assert(!IsSub && + "Unsigned versions of vmls[l]dav[a]/vrmlsldavh[a] do not exist"); + assert(!IsExchange && + "Unsigned versions of vmla[l]dav[a]x/vrmlaldavh[a]x do not exist"); + } + + // For instructions with long (64-bit) scalar output, operands 4 and 5 are the + // input accumulator value (low, high) + // Otherwise, operand 4 is the input accumulator value + bool Acc1IsZero = false; + if (ConstantSDNode *Acc1Const = dyn_cast<ConstantSDNode>(N->getOperand(4))) + if (Acc1Const->getZExtValue() == 0) + Acc1IsZero = true; + bool Acc2IsZero = false; + if (Long) { + if (ConstantSDNode *Acc2Const = dyn_cast<ConstantSDNode>(N->getOperand(5))) + if (Acc2Const->getZExtValue() == 0) + Acc2IsZero = true; + } else + Acc2IsZero = true; + + // If the input accumulator value is not zero, select an instruction with + // 
accumulator, otherwise select an instruction without accumulator + bool IsAccum = !(Acc1IsZero && Acc2IsZero); + + const uint16_t *Opcodes = IsUnsigned ? OpcodesU : OpcodesS; + if (IsSub) Opcodes += 4 * Stride; + if (IsExchange) Opcodes += 2 * Stride; + if (IsAccum) Opcodes += Stride; + uint16_t Opcode = Opcodes[TySize]; + + SDLoc Loc(N); + SmallVector<SDValue, 8> Ops; + // Push the accumulator operands, if they are used + if (IsAccum) { + Ops.push_back(N->getOperand(4)); + if (Long) + Ops.push_back(N->getOperand(5)); + } + + int NumAccumOperands = Long ? 2 : 1; + // Push the two vector operands + for (int i = 4 + NumAccumOperands; i < 6 + NumAccumOperands; ++i) + Ops.push_back(N->getOperand(i)); + + if (Predicated) + AddMVEPredicateToOps(Ops, Loc, N->getOperand(6 + NumAccumOperands)); + else + AddEmptyMVEPredicateToOps(Ops, Loc); + + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); +} + +void ARMDAGToDAGISel::SelectMVE_VMLDAV(SDNode *N, bool Predicated, + const uint16_t *OpcodesS, + const uint16_t *OpcodesU) { + + EVT VecTy = N->getOperand(5).getValueType(); + size_t SizeIndex; + switch (VecTy.getVectorElementType().getSizeInBits()) { + case 8: + SizeIndex = 0; + break; + case 16: + SizeIndex = 1; + break; + case 32: + SizeIndex = 2; + break; + default: + llvm_unreachable("bad vector element size"); + } + + SelectBaseMVE_VMLDAV(N, Predicated, false, OpcodesS, OpcodesU, 3, SizeIndex); +} + +void ARMDAGToDAGISel::SelectMVE_VMLLDAV(SDNode *N, bool Predicated, + const uint16_t *OpcodesS, + const uint16_t *OpcodesU) { + EVT VecTy = N->getOperand(6).getValueType(); + size_t SizeIndex; + switch (VecTy.getVectorElementType().getSizeInBits()) { + case 16: + SizeIndex = 0; + break; + case 32: + SizeIndex = 1; + break; + default: + llvm_unreachable("bad vector element size"); + } + + SelectBaseMVE_VMLDAV(N, Predicated, true, OpcodesS, OpcodesU, 2, SizeIndex); +} + +void ARMDAGToDAGISel::SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated, + const uint16_t *OpcodesS, + const uint16_t *OpcodesU) { + EVT VecTy = N->getOperand(6).getValueType(); + assert(VecTy.getVectorElementType().getSizeInBits() == 32 && + "bad vector element size"); + SelectBaseMVE_VMLDAV(N, Predicated, true, OpcodesS, OpcodesU, 1, 0); +} + void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs, const uint16_t *const *Opcodes) { EVT VT = N->getValueType(0); @@ -4361,6 +4554,76 @@ SelectMVE_VADCSBC(N, ARM::MVE_VADC, ARM::MVE_VADCI, true, IntNo == Intrinsic::arm_mve_vadc_predicated); return; + + case Intrinsic::arm_mve_vabav: + case Intrinsic::arm_mve_vabav_predicated: { + static const uint16_t OpcodesS[] = {ARM::MVE_VABAVs8, ARM::MVE_VABAVs16, + ARM::MVE_VABAVs32}; + static const uint16_t OpcodesU[] = {ARM::MVE_VABAVu8, ARM::MVE_VABAVu16, + ARM::MVE_VABAVu32}; + SelectMVE_VABAV(N, IntNo == Intrinsic::arm_mve_vabav_predicated, + OpcodesS, OpcodesU); + return; + } + + case Intrinsic::arm_mve_vmldava: + case Intrinsic::arm_mve_vmldava_predicated: { + static const uint16_t OpcodesU[] = { + ARM::MVE_VMLADAVu8, ARM::MVE_VMLADAVu16, ARM::MVE_VMLADAVu32, + ARM::MVE_VMLADAVau8, ARM::MVE_VMLADAVau16, ARM::MVE_VMLADAVau32, + }; + static const uint16_t OpcodesS[] = { + ARM::MVE_VMLADAVs8, ARM::MVE_VMLADAVs16, ARM::MVE_VMLADAVs32, + ARM::MVE_VMLADAVas8, ARM::MVE_VMLADAVas16, ARM::MVE_VMLADAVas32, + ARM::MVE_VMLADAVxs8, ARM::MVE_VMLADAVxs16, ARM::MVE_VMLADAVxs32, + ARM::MVE_VMLADAVaxs8, ARM::MVE_VMLADAVaxs16, ARM::MVE_VMLADAVaxs32, + ARM::MVE_VMLSDAVs8, ARM::MVE_VMLSDAVs16, ARM::MVE_VMLSDAVs32, + ARM::MVE_VMLSDAVas8, 
ARM::MVE_VMLSDAVas16, ARM::MVE_VMLSDAVas32, + ARM::MVE_VMLSDAVxs8, ARM::MVE_VMLSDAVxs16, ARM::MVE_VMLSDAVxs32, + ARM::MVE_VMLSDAVaxs8, ARM::MVE_VMLSDAVaxs16, ARM::MVE_VMLSDAVaxs32, + }; + SelectMVE_VMLDAV(N, IntNo == Intrinsic::arm_mve_vmldava_predicated, + OpcodesS, OpcodesU); + return; + } + + case Intrinsic::arm_mve_vmlldava: + case Intrinsic::arm_mve_vmlldava_predicated: { + static const uint16_t OpcodesU[] = { + ARM::MVE_VMLALDAVu16, ARM::MVE_VMLALDAVu32, + ARM::MVE_VMLALDAVau16, ARM::MVE_VMLALDAVau32, + }; + static const uint16_t OpcodesS[] = { + ARM::MVE_VMLALDAVs16, ARM::MVE_VMLALDAVs32, + ARM::MVE_VMLALDAVas16, ARM::MVE_VMLALDAVas32, + ARM::MVE_VMLALDAVxs16, ARM::MVE_VMLALDAVxs32, + ARM::MVE_VMLALDAVaxs16, ARM::MVE_VMLALDAVaxs32, + ARM::MVE_VMLSLDAVs16, ARM::MVE_VMLSLDAVs32, + ARM::MVE_VMLSLDAVas16, ARM::MVE_VMLSLDAVas32, + ARM::MVE_VMLSLDAVxs16, ARM::MVE_VMLSLDAVxs32, + ARM::MVE_VMLSLDAVaxs16, ARM::MVE_VMLSLDAVaxs32, + }; + SelectMVE_VMLLDAV(N, IntNo == Intrinsic::arm_mve_vmlldava_predicated, + OpcodesS, OpcodesU); + return; + } + + case Intrinsic::arm_mve_vrmlldavha: + case Intrinsic::arm_mve_vrmlldavha_predicated: { + static const uint16_t OpcodesU[] = { + ARM::MVE_VRMLALDAVHu32, ARM::MVE_VRMLALDAVHau32, + }; + static const uint16_t OpcodesS[] = { + ARM::MVE_VRMLALDAVHs32, ARM::MVE_VRMLALDAVHas32, + ARM::MVE_VRMLALDAVHxs32, ARM::MVE_VRMLALDAVHaxs32, + ARM::MVE_VRMLSLDAVHs32, ARM::MVE_VRMLSLDAVHas32, + ARM::MVE_VRMLSLDAVHxs32, ARM::MVE_VRMLSLDAVHaxs32, + }; + SelectMVE_VRMLLDAVH(N, IntNo == Intrinsic::arm_mve_vrmlldavha_predicated, + OpcodesS, OpcodesU); + return; + } + } break; } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabavq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabavq.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabavq.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + +declare i32 @llvm.arm.mve.vabav.v16i8(i32, i32, <16 x i8>, <16 x i8>) +declare i32 @llvm.arm.mve.vabav.v8i16(i32, i32, <8 x i16>, <8 x i16>) +declare i32 @llvm.arm.mve.vabav.v4i32(i32, i32, <4 x i32>, <4 x i32>) + +declare i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32, i32, <16 x i8>, <16 x i8>, <16 x i1>) +declare i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32, i32, <8 x i16>, <8 x i16>, <8 x i1>) +declare i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32, i32, <4 x i32>, <4 x i32>, <4 x i1>) + +define arm_aapcs_vfpcc i32 @test_vabavq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vabavq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vabav.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vabav.v16i8(i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vabavq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vabavq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vabav.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vabav.v8i16(i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vabavq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vabavq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vabav.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 
@llvm.arm.mve.vabav.v4i32(i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vabavq_u8(i32 %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vabavq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vabav.u8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vabav.v16i8(i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vabavq_u16(i32 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vabavq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vabav.u16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vabav.v8i16(i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vabavq_u32(i32 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vabavq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vabav.u32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vabav.v4i32(i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vabavq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vabavq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vabavt.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vabavq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vabavq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vabavt.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vabavq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vabavq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vabavt.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vabavq_p_u8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vabavq_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vabavt.u8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vabavq_p_u16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vabavq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vabavt.u16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vabavq_p_u32(i32 %a, <4 x i32> %b, <4 
x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vabavq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vabavt.u32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1) + ret i32 %2 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmldav.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmldav.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmldav.ll @@ -0,0 +1,734 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + +declare i32 @llvm.arm.mve.vmldava.v16i8(i32, i32, i32, i32, <16 x i8>, <16 x i8>) +declare i32 @llvm.arm.mve.vmldava.v8i16(i32, i32, i32, i32, <8 x i16>, <8 x i16>) +declare i32 @llvm.arm.mve.vmldava.v4i32(i32, i32, i32, i32, <4 x i32>, <4 x i32>) + +declare i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32, i32, i32, i32, <16 x i8>, <16 x i8>, <16 x i1>) +declare i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32, i32, i32, i32, <8 x i16>, <8 x i16>, <8 x i1>) +declare i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32, i32, i32, i32, <4 x i32>, <4 x i32>, <4 x i1>) + +define arm_aapcs_vfpcc i32 @test_vmladavaq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmladavaq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlava.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmladavaq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlava.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmladavaq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlava.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaq_u8(i32 %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmladavaq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlava.u8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaq_u16(i32 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmladavaq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlava.u16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaq_u32(i32 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmladavaq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlava.u32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c) + ret i32 
%0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaxq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmladavaxq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmladavax.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaxq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmladavaxq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmladavax.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaxq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmladavaxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmladavax.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmlsdavaq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdava.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmlsdavaq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdava.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmlsdavaq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdava.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmlsdavaxq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdavax.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmlsdavaxq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdavax.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmlsdavaxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdavax.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavaq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavat.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1) + ret 
i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavaq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavat.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavaq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavat.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaq_p_u8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavaq_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavat.u8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaq_p_u16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavaq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavat.u16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaq_p_u32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavaq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavat.u32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaxq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavaxq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmladavaxt.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaxq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavaxq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmladavaxt.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, 
i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavaxq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavaxq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmladavaxt.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsdavaq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavat.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsdavaq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavat.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsdavaq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavat.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsdavaxq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavaxt.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsdavaxq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavaxt.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsdavaxq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavaxt.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 
%0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmladavq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlav.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmladavq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlav.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmladavq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlav.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmladavq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlav.u8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmladavq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlav.u16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmladavq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlav.u32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavxq_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmladavxq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmladavx.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavxq_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmladavxq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmladavx.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavxq_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmladavxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmladavx.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavq_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmlsdavq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdav.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavq_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmlsdavq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdav.s16 
r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavq_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlsdavq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdav.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavxq_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmlsdavxq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdavx.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavxq_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmlsdavxq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdavx.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavxq_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlsdavxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsdavx.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavt.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavt.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavt.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_p_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavq_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavt.u8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_p_u16(<8 x i16> %a, <8 x i16> %b, 
i16 zeroext %p) { +; CHECK-LABEL: test_vmladavq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavt.u16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavq_p_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlavt.u32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavxq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavxq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmladavxt.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavxq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavxq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmladavxt.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmladavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmladavxq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmladavxt.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsdavq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavt.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsdavq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavt.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext 
%p) { +; CHECK-LABEL: test_vmlsdavq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavt.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavxq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsdavxq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavxt.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavxq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsdavxq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavxt.s16 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmlsdavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsdavxq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsdavxt.s32 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1) + ret i32 %2 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmlldav.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmlldav.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmlldav.ll @@ -0,0 +1,1183 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + +declare { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>) +declare { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>) +declare { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>) + +declare { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>, <8 x i1>) +declare { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>, <4 x i1>) +declare { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>, <4 x i1>) + +define arm_aapcs_vfpcc i64 @test_vmlaldavaq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmlaldavaq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c) + %4 = 
extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmlaldavaq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalva.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavaq_u16(i64 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmlaldavaq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalva.u16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavaq_u32(i64 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmlaldavaq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalva.u32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmlaldavaxq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlaldavax.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmlaldavaxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlaldavax.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vmlsldavaq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmlsldavaq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsldava.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = 
call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vmlsldavaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmlsldavaq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsldava.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vmlsldaxvaq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmlsldaxvaq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsldavax.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vmlsldavaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmlsldavaxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsldavax.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vrmlaldavhaq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrmlalvha.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_u32(i64 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vrmlaldavhaq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrmlalvha.u32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vrmlaldavhaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vrmlaldavhaxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: 
vrmlaldavhax.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vrmlsldavhaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vrmlsldavhaq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrmlsldavha.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vrmlsldavhaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vrmlsldavhaxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrmlsldavhax.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaldavaq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaldavaq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlalvat.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_u16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaldavaq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlalvat.u16 r0, r1, q0, q1 +; 
CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_u32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaldavaq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlalvat.u32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaldavaxq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlaldavaxt.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaldavaxq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlaldavaxt.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vmlsldavaq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsldavaq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsldavat.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4) + %6 = extractvalue { i32, i32 } 
%5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vmlsldavaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsldavaq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsldavat.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vmlsldaxvaq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsldaxvaq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsldavaxt.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vmlsldavaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlsldavaxq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlsldavaxt.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vrmlaldavhaq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vrmlalvhat.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_p_u32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vrmlaldavhaq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr 
p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vrmlalvhat.u32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vrmlaldavhaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vrmlaldavhaxq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vrmlaldavhaxt.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vrmlsldavhaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vrmlsldavhaq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vrmlsldavhat.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vrmlsldavhaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) { +; CHECK-LABEL: test_vrmlsldavhaxq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vrmlsldavhaxt.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %a, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %a to i32 + %3 = zext i16 %p to i32 + %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3) + %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4) + %6 = extractvalue { i32, i32 } %5, 1 + %7 = zext i32 %6 to i64 + %8 = shl i64 %7, 32 + %9 = extractvalue { i32, i32 } %5, 0 + %10 = zext i32 %9 to i64 + %11 = or i64 %8, %10 + ret i64 %11 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavq_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmlaldavq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define 
arm_aapcs_vfpcc i64 @test_vmlaldavq_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlaldavq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavq_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmlaldavq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavq_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlaldavq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavxq_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmlaldavxq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlaldavx.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_vmlaldavxq_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlaldavxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlaldavx.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_vmlsldavq_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmlsldavq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsldav.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_vmlsldavq_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlsldavq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlsldav.s32 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = 
extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavxq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlsldavxq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsldavx.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlsldavxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsldavx.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlaldavhq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlalvh.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlaldavhq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlalvh.u32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlaldavhxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlaldavhx.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlsldavhq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlsldavh.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlsldavhxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlsldavhx.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvt.u16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvt.u32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavxq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavxq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlaldavxt.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlaldavxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavt.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavxq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavxq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavxt.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlalvht.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_p_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlalvht.u32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlaldavhxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlsldavhq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlsldavht.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlsldavhxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlsldavhxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}