Index: clang/include/clang/Basic/arm_mve.td
===================================================================
--- clang/include/clang/Basic/arm_mve.td
+++ clang/include/clang/Basic/arm_mve.td
@@ -43,6 +43,12 @@
         (IRIntBase<"sadd_sat", [Vector]> $a, $b)>;
 def vqsubq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
         (IRIntBase<"ssub_sat", [Vector]> $a, $b)>;
+let pnt = PNT_NType in {
+  def vqaddq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (IRIntBase<"sadd_sat", [Vector]> $a, (splat $b))>;
+  def vqsubq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (IRIntBase<"ssub_sat", [Vector]> $a, (splat $b))>;
+}
 }
 let params = T.Unsigned in {
 def vqaddq_u: Intrinsic<Vector, (args Vector:$a, Vector:$b),
         (IRIntBase<"uadd_sat", [Vector]> $a, $b)>,
     NameOverride<"vqaddq">;
 def vqsubq_u: Intrinsic<Vector, (args Vector:$a, Vector:$b),
         (IRIntBase<"usub_sat", [Vector]> $a, $b)>,
     NameOverride<"vqsubq">;
+let pnt = PNT_NType in {
+  def vqaddq_u_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (IRIntBase<"uadd_sat", [Vector]> $a, (splat $b))>,
+      NameOverride<"vqaddq_n">;
+  def vqsubq_u_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (IRIntBase<"usub_sat", [Vector]> $a, (splat $b))>,
+      NameOverride<"vqsubq_n">;
+}
 }
 
 // Some intrinsics below are implemented not as IR fragments, but as
@@ -85,12 +99,32 @@
 def vmulltq_int: Intrinsic $a, $b, (unsignedflag Scalar), 1)>;
+let pnt = PNT_NType in {
+  def vaddq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (add $a, (splat $b))>;
+  def vsubq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (sub $a, (splat $b))>;
+  def vmulq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (mul $a, (splat $b))>;
+  def vhaddq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (IRInt<"vhadd", [Vector]> $a, (splat $b),
+      (unsignedflag Scalar))>;
+  def vhsubq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (IRInt<"vhsub", [Vector]> $a, (splat $b),
+      (unsignedflag Scalar))>;
+}
 }
 
 let params = T.Signed in {
 def vqdmulhq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
     (IRInt<"vqdmulh", [Vector]> $a, $b)>;
 def vqrdmulhq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
     (IRInt<"vqrdmulh", [Vector]> $a, $b)>;
+let pnt = PNT_NType in {
+  def vqdmulhq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (IRInt<"vqdmulh", [Vector]> $a, (splat $b))>;
+  def vqrdmulhq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (IRInt<"vqrdmulh", [Vector]> $a, (splat $b))>;
+}
 }
 
 let params = T.Poly, overrideKindLetter = "p" in {
@@ -114,6 +148,18 @@
             NameOverride<"vsubq">;
 def vmulqf: Intrinsic<Vector, (args Vector:$a, Vector:$b), (fmul $a, $b)>,
             NameOverride<"vmulq">;
+
+let pnt = PNT_NType in {
+  def vaddqf_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (fadd $a, (splat $b))>,
+      NameOverride<"vaddq_n">;
+  def vsubqf_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (fsub $a, (splat $b))>,
+      NameOverride<"vsubq_n">;
+  def vmulqf_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+      (fmul $a, (splat $b))>,
+      NameOverride<"vmulq_n">;
+}
 }
 
 let params = !listconcat(T.Int16, T.Int32) in {
@@ -217,6 +263,16 @@
                         extraArgs, (? $pred, $inactive)), wantXVariant>;
 }
 
+multiclass VectorScalarArithmetic<string operation, string basename,
+                                  dag extraArgs = (?), bit wantXVariant = 1> {
+  defm "" : IntrinsicMXNameOverride<
+      Vector, (args Vector:$a, unpromoted<Scalar>:$b, Predicate:$pred),
+      !con((IRInt<operation, [Vector, Predicate]> $a, (splat $b)),
+           extraArgs, (? $pred, $inactive)), basename, wantXVariant, "_n",
+      PNT_NType, PNT_NType>;
+}
+
 multiclass VectorVectorArithmeticBitcast {
   defm "" : IntrinsicMX;
 defm vornq : VectorVectorArithmeticBitcast<"orn_predicated">;
 defm vorrq : VectorVectorArithmeticBitcast<"orr_predicated">;
+
+  defm : VectorScalarArithmetic<"add_predicated", "vaddq">;
+  defm : VectorScalarArithmetic<"sub_predicated", "vsubq">;
+  defm : VectorScalarArithmetic<"mul_predicated", "vmulq">;
 }
 
 multiclass DblVectorVectorArithmetic {
@@ -260,6 +320,11 @@
 defm vhsubq : VectorVectorArithmetic<"hsub_predicated", (? (unsignedflag Scalar))>;
 defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 0))>;
 defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 1))>;
+
+  defm : VectorScalarArithmetic<"qadd_predicated", "vqaddq", (? (unsignedflag Scalar)), 0>;
+  defm : VectorScalarArithmetic<"hadd_predicated", "vhaddq", (? (unsignedflag Scalar))>;
+  defm : VectorScalarArithmetic<"qsub_predicated", "vqsubq", (? (unsignedflag Scalar)), 0>;
+  defm : VectorScalarArithmetic<"hsub_predicated", "vhsubq", (?
+                                    (unsignedflag Scalar))>;
 }
 
 let params = T.Signed in {
 defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated", (?), 0>;
@@ -268,6 +333,9 @@
     (IRInt<"vmina_predicated", [UVector,Predicate]> $a, $b, $pred)>;
 def vmaxaq_m: Intrinsic<Vector, (args Vector:$a, UVector:$b, Predicate:$pred),
     (IRInt<"vmaxa_predicated", [UVector,Predicate]> $a, $b, $pred)>;
+
+  defm : VectorScalarArithmetic<"qdmulh_predicated", "vqdmulhq", (?), 0>;
+  defm : VectorScalarArithmetic<"qrdmulh_predicated", "vqrdmulhq", (?), 0>;
 }
 
 let params = T.Poly, overrideKindLetter = "p" in {
Index: clang/include/clang/Basic/arm_mve_defs.td
===================================================================
--- clang/include/clang/Basic/arm_mve_defs.td
+++ clang/include/clang/Basic/arm_mve_defs.td
@@ -495,6 +495,29 @@
   }
 }
 
+// Same as above, but with an additional parameter 'basename' which overrides
+// the C intrinsic base name
+multiclass IntrinsicMXNameOverride<Type rettype, dag arguments, dag cg,
+                                   string basename, bit wantXVariant,
+                                   string nameSuffix,
+                                   PolymorphicNameType pnt_m,
+                                   PolymorphicNameType pnt_x> {
+  def "_m" # nameSuffix:
+     Intrinsic<rettype, !con((args rettype:$inactive), arguments), cg>,
+     NameOverride<basename # "_m" # nameSuffix> {
+    let pnt = pnt_m;
+  }
+
+  foreach unusedVar = !if(!eq(wantXVariant, 1), [1], []) in {
+    def "_x" # nameSuffix:
+       Intrinsic<rettype, arguments, (seq (undef rettype):$inactive, cg)>,
+       NameOverride<basename # "_x" # nameSuffix> {
+      let pnt = pnt_x;
+    }
+  }
+}
+
+
 // -----------------------------------------------------------------------------
 // Convenience lists of parameter types. 'T' is just a container record, so you
 // can define a typical intrinsic with 'let Params = T.Usual', or similar,
Index: clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
===================================================================
--- clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
+++ clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
@@ -1,12 +1,12 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -O1 | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -O1 | FileCheck %s
 
 #include <arm_mve.h>
 
 // CHECK-LABEL: @test_vaddq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i32> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i32> [[B:%.*]], [[A:%.*]]
 // CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 //
 uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b)
@@ -95,3 +95,114 @@
     return vaddq_x_f16(a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vaddq_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i32> [[DOTSPLAT]], [[A:%.*]]
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vaddq_n_u32(uint32x4_t a, uint32_t b)
+{
+#ifdef POLYMORPHIC
+    return vaddq(a, b);
+#else /* POLYMORPHIC */
+    return vaddq_n_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddq_n_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to
i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = fadd <8 x half> [[DOTSPLAT]], [[A:%.*]] +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vaddq_n_f16(float16x8_t a, float16_t b) +{ +#ifdef POLYMORPHIC + return vaddq(a, b); +#else /* POLYMORPHIC */ + return vaddq_n_f16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vaddq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vaddq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddq_m_n_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vaddq_m_n_f32(float32x4_t inactive, float32x4_t a, float32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vaddq_m_n_f32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddq_x_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vaddq_x_n_u16(uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vaddq_x(a, b, p); +#else /* POLYMORPHIC */ + return vaddq_x_n_u16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddq_x_n_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = 
bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> undef) +// CHECK-NEXT: ret <8 x half> [[TMP4]] +// +float16x8_t test_vaddq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vaddq_x(a, b, p); +#else /* POLYMORPHIC */ + return vaddq_x_n_f16(a, b, p); +#endif /* POLYMORPHIC */ +} + Index: clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c =================================================================== --- clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c +++ clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c @@ -141,3 +141,159 @@ return vhaddq_x_u32(a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vhaddq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vhaddq_n_u8(uint8x16_t a, uint8_t b) +{ +#ifdef POLYMORPHIC + return vhaddq(a, b); +#else /* POLYMORPHIC */ + return vhaddq_n_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vhaddq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vhaddq(a, b); +#else /* POLYMORPHIC */ + return vhaddq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vhaddq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vhaddq(a, b); +#else /* POLYMORPHIC */ + return vhaddq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], 
<16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vhaddq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vhaddq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_m_n_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vhaddq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_x_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vhaddq_x_n_u8(uint8x16_t a, uint8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_x_n_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_x_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> 
[[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vhaddq_x_n_s16(int16x8_t a, int16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_x_n_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_x_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vhaddq_x_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_x_n_u32(a, b, p); +#endif /* POLYMORPHIC */ +} Index: clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c =================================================================== --- clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c +++ clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c @@ -93,3 +93,159 @@ return vhsubq_m_s32(inactive, a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vhsubq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vhsubq_n_u8(uint8x16_t a, uint8_t b) +{ +#ifdef POLYMORPHIC + return vhsubq(a, b); +#else /* POLYMORPHIC */ + return vhsubq_n_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vhsubq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vhsubq(a, b); +#else /* POLYMORPHIC */ + return vhsubq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vhsubq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vhsubq(a, b); +#else /* POLYMORPHIC */ + return vhsubq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vhsubq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vhsubq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_m_n_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vhsubq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_x_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vhsubq_x_n_u8(uint8x16_t a, uint8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_x_n_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_x_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// 
CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vhsubq_x_n_s16(int16x8_t a, int16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_x_n_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_x_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vhsubq_x_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_x_n_u32(a, b, p); +#endif /* POLYMORPHIC */ +} Index: clang/test/CodeGen/arm-mve-intrinsics/vmulq.c =================================================================== --- clang/test/CodeGen/arm-mve-intrinsics/vmulq.c +++ clang/test/CodeGen/arm-mve-intrinsics/vmulq.c @@ -1,12 +1,12 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -O1 | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -O1 | FileCheck %s #include // CHECK-LABEL: @test_vmulq_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[B:%.*]], [[A:%.*]] // CHECK-NEXT: ret <16 x i8> [[TMP0]] // uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) @@ -20,7 +20,7 @@ // CHECK-LABEL: @test_vmulq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = mul <8 x i16> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[TMP0:%.*]] = mul <8 x i16> [[B:%.*]], [[A:%.*]] // CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) @@ -34,7 +34,7 @@ // CHECK-LABEL: @test_vmulq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[B:%.*]], [[A:%.*]] // CHECK-NEXT: ret <4 x i32> 
[[TMP0]] // uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) @@ -172,14 +172,14 @@ #endif /* POLYMORPHIC */ } -// CHECK-LABEL: @test_vmulq_m_f32( +// CHECK-LABEL: @test_vmulq_x_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef) // CHECK-NEXT: ret <4 x float> [[TMP2]] // -float32x4_t test_vmulq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p) +float32x4_t test_vmulq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p) { #ifdef POLYMORPHIC return vmulq_x(a, b, p); @@ -187,3 +187,213 @@ return vmulq_x_f32(a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vmulq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[DOTSPLAT]], [[A:%.*]] +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vmulq_n_u8(uint8x16_t a, uint8_t b) +{ +#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_n_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <8 x i16> [[DOTSPLAT]], [[A:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[DOTSPLAT]], [[A:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_n_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = fmul <4 x float> [[DOTSPLAT]], [[A:%.*]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) +{ +#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_n_f32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) 
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vmulq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vmulq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulq_m_n_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vmulq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_m_n_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x half> [[TMP4]] +// +float16x8_t test_vmulq_m_n_f16(float16x8_t inactive, float16x8_t a, float16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulq_m_n_f16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_x_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vmulq_x_n_u8(uint8x16_t a, uint8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_x(a, b, p); +#else /* POLYMORPHIC */ + return vmulq_x_n_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_x_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vmulq_x_n_s16(int16x8_t a, int16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_x(a, b, p); +#else /* POLYMORPHIC */ + return vmulq_x_n_s16(a, b, p); +#endif /* POLYMORPHIC */ +} +// CHECK-LABEL: @test_vmulq_x_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vmulq_x_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_x(a, b, p); +#else /* POLYMORPHIC */ + return vmulq_x_n_u32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_x_n_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> undef) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vmulq_x_n_f32(float32x4_t a, float32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_x(a, b, p); +#else /* POLYMORPHIC */ + return vmulq_x_n_f32(a, b, p); +#endif /* POLYMORPHIC */ +} Index: clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c =================================================================== --- clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c +++ clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c @@ -93,3 +93,105 @@ return vqaddq_m_s32(inactive, a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: 
@test_vqaddq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vqaddq_n_u8(uint8x16_t a, uint8_t b) +{ +#ifdef POLYMORPHIC + return vqaddq(a, b); +#else /* POLYMORPHIC */ + return vqaddq_n_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqaddq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]]) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqaddq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vqaddq(a, b); +#else /* POLYMORPHIC */ + return vqaddq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqaddq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vqaddq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vqaddq(a, b); +#else /* POLYMORPHIC */ + return vqaddq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqaddq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqaddq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqaddq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqaddq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vqaddq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqaddq_m(inactive, a, b, p); +#else /* 
POLYMORPHIC */ + return vqaddq_m_n_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqaddq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqaddq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqaddq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} Index: clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c =================================================================== --- clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c +++ clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c @@ -93,3 +93,105 @@ return vqdmulhq_m_s32(inactive, a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vqdmulhq_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vqdmulhq_n_s8(int8x16_t a, int8_t b) +{ +#ifdef POLYMORPHIC + return vqdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqdmulhq_n_s8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmulhq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]]) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vqdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqdmulhq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmulhq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) +{ +#ifdef POLYMORPHIC + return vqdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqdmulhq_n_s32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmulhq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: 
[[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qdmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqdmulhq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmulhq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmulhq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qdmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqdmulhq_m_n_s16(int16x8_t inactive, int16x8_t a, int16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmulhq_m_n_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmulhq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqdmulhq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmulhq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} Index: clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c =================================================================== --- clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c +++ clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c @@ -93,3 +93,105 @@ return vqrdmulhq_m_s32(inactive, a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vqrdmulhq_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vqrdmulhq_n_s8(int8x16_t a, int8_t b) +{ +#ifdef POLYMORPHIC + return vqrdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqrdmulhq_n_s8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmulhq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 
x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]]) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vqrdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqrdmulhq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmulhq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) +{ +#ifdef POLYMORPHIC + return vqrdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqrdmulhq_n_s32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmulhq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qrdmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqrdmulhq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqrdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmulhq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmulhq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qrdmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqrdmulhq_m_n_s16(int16x8_t inactive, int16x8_t a, int16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqrdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmulhq_m_n_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmulhq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqrdmulhq_m_n_s32(int32x4_t inactive, 
int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqrdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmulhq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} Index: clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c =================================================================== --- clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c +++ clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c @@ -93,3 +93,105 @@ return vqsubq_m_s32(inactive, a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vqsubq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vqsubq_n_u8(uint8x16_t a, uint8_t b) +{ +#ifdef POLYMORPHIC + return vqsubq(a, b); +#else /* POLYMORPHIC */ + return vqsubq_n_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqsubq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]]) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqsubq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vqsubq(a, b); +#else /* POLYMORPHIC */ + return vqsubq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqsubq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vqsubq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vqsubq(a, b); +#else /* POLYMORPHIC */ + return vqsubq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqsubq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqsubq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqsubq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqsubq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// 
CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vqsubq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqsubq_m_n_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqsubq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqsubq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqsubq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} Index: clang/test/CodeGen/arm-mve-intrinsics/vsubq.c =================================================================== --- clang/test/CodeGen/arm-mve-intrinsics/vsubq.c +++ clang/test/CodeGen/arm-mve-intrinsics/vsubq.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -O1 | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -O1 | FileCheck %s #include @@ -95,3 +95,113 @@ return vsubq_x_f16(a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vsubq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = sub <4 x i32> [[A:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vsubq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vsubq(a, b); +#else /* POLYMORPHIC */ + return vsubq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vsubq_n_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 
[[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x half> [[A:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vsubq_n_f16(float16x8_t a, float16_t b) +{ +#ifdef POLYMORPHIC + return vsubq(a, b); +#else /* POLYMORPHIC */ + return vsubq_n_f16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vsubq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vsubq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vsubq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vsubq_m_n_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vsubq_m_n_f32(float32x4_t inactive, float32x4_t a, float32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vsubq_m_n_f32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vsubq_x_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vsubq_x_n_u16(uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vsubq_x(a, b, p); +#else /* POLYMORPHIC */ + return vsubq_x_n_u16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vsubq_x_n_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> undef) +// CHECK-NEXT: ret <8 x half> [[TMP4]] +// +float16x8_t test_vsubq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vsubq_x(a, b, p); +#else /* POLYMORPHIC */ + return vsubq_x_n_f16(a, b, p); +#endif /* POLYMORPHIC */ +} Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -4480,10 +4480,65 @@ let Inst{3-0} = Rm{3-0}; } +// Patterns for vector-scalar instructions with integer operands +multiclass MVE_vec_scalar_int_pat_m { + defvar UnpredSign = !if(unpred_has_sign, (? (i32 VTI.Unsigned)), (?)); + defvar PredSign = !if(pred_has_sign, (? (i32 VTI.Unsigned)), (?)); + + let Predicates = [HasMVEInt] in { + // Unpredicated version + def : Pat<(VTI.Vec !con((unpred_op (VTI.Vec MQPR:$Qm), + (VTI.Vec (ARMvdup GPR:$val))), + UnpredSign)), + (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 GPR:$val)))>; + // Predicated version + def : Pat<(VTI.Vec !con((pred_op (VTI.Vec MQPR:$Qm), + (VTI.Vec (ARMvdup GPR:$val))), + PredSign, + (pred_op (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))), + (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 GPR:$val), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +// Patterns for vector-scalar instructions with FP operands +multiclass MVE_vec_scalar_fp_pat_m { + let Predicates = [HasMVEFloat] in { + // Unpredicated F16 + def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)))), + (v8f16 (instr_f16 (v8f16 MQPR:$Qm), + (i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR))))>; + // Unpredicated F32 + def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)))), + (v4f32 (instr_f32 (v4f32 MQPR:$Qm), + (i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR))))>; + // Predicated F16 + def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)), + (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))), + (v8f16 (instr_f16 (v8f16 MQPR:$Qm), + (i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR)), + ARMVCCThen, (v8i1 VCCR:$mask), + (v8f16 MQPR:$inactive)))>; + // Predicated F32 + def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)), + (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))), + (v4f32 (instr_f32 (v4f32 MQPR:$Qm), + (i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR)), + ARMVCCThen, (v4i1 VCCR:$mask), + (v4f32 MQPR:$inactive)))>; + } +} + class MVE_VADDSUB_qr size, - bit bit_5, bit bit_12, bit bit_16, - bit bit_28, list pattern=[]> - : MVE_qDest_rSrc { + bit bit_5, bit bit_12, bit bit_16, bit bit_28> + : MVE_qDest_rSrc { let Inst{28} = bit_28; let Inst{21-20} = size; @@ -4494,42 +4549,60 @@ let validForTailPredication = 1; } -multiclass MVE_VADDSUB_qr_sizes pattern=[]> { - def "8" : MVE_VADDSUB_qr; - def "16" : MVE_VADDSUB_qr; - def "32" : MVE_VADDSUB_qr; -} - -defm MVE_VADD_qr_i : MVE_VADDSUB_qr_sizes<"vadd", "i", 0b0, 0b0, 0b1, 0b0>; -defm MVE_VQADD_qr_s : MVE_VADDSUB_qr_sizes<"vqadd", "s", 0b1, 0b0, 0b0, 0b0>; -defm MVE_VQADD_qr_u : MVE_VADDSUB_qr_sizes<"vqadd",
"u", 0b1, 0b0, 0b0, 0b1>; - -defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>; -defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>; -defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>; - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), - (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), - (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), - (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; -} - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), - (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), - (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), - (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; -} +// Vector-scalar add/sub +multiclass MVE_VADDSUB_qr_m { + def "" : MVE_VADDSUB_qr; + defm : MVE_vec_scalar_int_pat_m(NAME), VTI, + unpred_op, pred_int>; +} + +// Vector-scalar saturating add/sub +multiclass MVE_VQADDSUB_qr_m { + def "" : MVE_VADDSUB_qr; + defvar unpred_op = !if(VTI.Unsigned, unpred_op_u, unpred_op_s); + defm : MVE_vec_scalar_int_pat_m(NAME), VTI, + unpred_op, pred_int, 0, 1>; +} + +multiclass MVE_VADD_qr_m + : MVE_VADDSUB_qr_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>; + +multiclass MVE_VSUB_qr_m + : MVE_VADDSUB_qr_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>; + +multiclass MVE_VQADD_qr_m + : MVE_VQADDSUB_qr_m<"vqadd", VTI, 0b0, saddsat, uaddsat, + int_arm_mve_qadd_predicated>; + +multiclass MVE_VQSUB_qr_m + : MVE_VQADDSUB_qr_m<"vqsub", VTI, 0b1, ssubsat, usubsat, + int_arm_mve_qsub_predicated>; + +defm MVE_VADD_qr_i8 : MVE_VADD_qr_m; +defm MVE_VADD_qr_i16 : MVE_VADD_qr_m; +defm MVE_VADD_qr_i32 : MVE_VADD_qr_m; + +defm MVE_VSUB_qr_i8 : MVE_VSUB_qr_m; +defm MVE_VSUB_qr_i16 : MVE_VSUB_qr_m; +defm MVE_VSUB_qr_i32 : MVE_VSUB_qr_m; + +defm MVE_VQADD_qr_s8 : MVE_VQADD_qr_m; +defm MVE_VQADD_qr_s16 : MVE_VQADD_qr_m; +defm MVE_VQADD_qr_s32 : MVE_VQADD_qr_m; +defm MVE_VQADD_qr_u8 : MVE_VQADD_qr_m; +defm MVE_VQADD_qr_u16 : MVE_VQADD_qr_m; +defm MVE_VQADD_qr_u32 : MVE_VQADD_qr_m; + +defm MVE_VQSUB_qr_s8 : MVE_VQSUB_qr_m; +defm MVE_VQSUB_qr_s16 : MVE_VQSUB_qr_m; +defm MVE_VQSUB_qr_s32 : MVE_VQSUB_qr_m; +defm MVE_VQSUB_qr_u8 : MVE_VQSUB_qr_m; +defm MVE_VQSUB_qr_u16 : MVE_VQSUB_qr_m; +defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m; class MVE_VQDMULL_qr pattern=[]> @@ -4566,19 +4639,34 @@ let validForTailPredication = 1; } -def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>; -def MVE_VHADD_qr_s16 : MVE_VxADDSUB_qr<"vhadd", "s16", 0b0, 0b01, 0b0>; -def MVE_VHADD_qr_s32 : MVE_VxADDSUB_qr<"vhadd", "s32", 0b0, 0b10, 0b0>; -def MVE_VHADD_qr_u8 : MVE_VxADDSUB_qr<"vhadd", "u8", 0b1, 0b00, 0b0>; -def MVE_VHADD_qr_u16 : MVE_VxADDSUB_qr<"vhadd", "u16", 0b1, 0b01, 0b0>; -def MVE_VHADD_qr_u32 : MVE_VxADDSUB_qr<"vhadd", "u32", 0b1, 0b10, 0b0>; +multiclass MVE_VHADDSUB_qr_m { + def "" : MVE_VxADDSUB_qr; + defm : MVE_vec_scalar_int_pat_m(NAME), + VTI, unpred_int, pred_int, 1, 1>; +} -def MVE_VHSUB_qr_s8 : MVE_VxADDSUB_qr<"vhsub", "s8", 0b0, 0b00, 0b1>; -def MVE_VHSUB_qr_s16 : MVE_VxADDSUB_qr<"vhsub", "s16", 0b0, 
0b01, 0b1>; -def MVE_VHSUB_qr_s32 : MVE_VxADDSUB_qr<"vhsub", "s32", 0b0, 0b10, 0b1>; -def MVE_VHSUB_qr_u8 : MVE_VxADDSUB_qr<"vhsub", "u8", 0b1, 0b00, 0b1>; -def MVE_VHSUB_qr_u16 : MVE_VxADDSUB_qr<"vhsub", "u16", 0b1, 0b01, 0b1>; -def MVE_VHSUB_qr_u32 : MVE_VxADDSUB_qr<"vhsub", "u32", 0b1, 0b10, 0b1>; +multiclass MVE_VHADD_qr_m : + MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, + int_arm_mve_hadd_predicated>; + +multiclass MVE_VHSUB_qr_m : + MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, + int_arm_mve_hsub_predicated>; + +defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m; + +defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m; let Predicates = [HasMVEFloat] in { def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>; @@ -4588,6 +4676,11 @@ def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>; } +defm : MVE_vec_scalar_fp_pat_m; +defm : MVE_vec_scalar_fp_pat_m; + class MVE_VxSHL_qr size, bit bit_7, bit bit_17, list pattern=[]> : MVE_qDest_single_rSrc { @@ -4678,9 +4771,8 @@ (v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>; } -class MVE_VMUL_qr_int size, list pattern=[]> - : MVE_qDest_rSrc { +class MVE_VMUL_qr_int size> + : MVE_qDest_rSrc { let Inst{28} = 0b0; let Inst{21-20} = size; @@ -4691,19 +4783,16 @@ let validForTailPredication = 1; } -def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>; -def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>; -def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>; - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), - (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), - (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), - (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +multiclass MVE_VMUL_qr_int_m { + def "" : MVE_VMUL_qr_int<"vmul", VTI.Suffix, VTI.Size>; + defm : MVE_vec_scalar_int_pat_m(NAME), VTI, + mul, int_arm_mve_mul_predicated>; } +defm MVE_VMUL_qr_i8 : MVE_VMUL_qr_int_m; +defm MVE_VMUL_qr_i16 : MVE_VMUL_qr_int_m; +defm MVE_VMUL_qr_i32 : MVE_VMUL_qr_int_m; + class MVE_VxxMUL_qr bits_21_20, list pattern=[]> : MVE_qDest_rSrc { @@ -4716,19 +4805,37 @@ let Inst{5} = 0b1; } -def MVE_VQDMULH_qr_s8 : MVE_VxxMUL_qr<"vqdmulh", "s8", 0b0, 0b00>; -def MVE_VQDMULH_qr_s16 : MVE_VxxMUL_qr<"vqdmulh", "s16", 0b0, 0b01>; -def MVE_VQDMULH_qr_s32 : MVE_VxxMUL_qr<"vqdmulh", "s32", 0b0, 0b10>; +multiclass MVE_VxxMUL_qr_m { + def "" : MVE_VxxMUL_qr; + defm : MVE_vec_scalar_int_pat_m(NAME), VTI, + int_unpred, int_pred>; +} -def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>; -def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>; -def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>; +multiclass MVE_VQDMULH_qr_m : + MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0, + int_arm_mve_vqdmulh, int_arm_mve_qdmulh_predicated>; + +multiclass MVE_VQRDMULH_qr_m : + MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1, + int_arm_mve_vqrdmulh, 
int_arm_mve_qrdmulh_predicated>; + +defm MVE_VQDMULH_qr_s8 : MVE_VQDMULH_qr_m; +defm MVE_VQDMULH_qr_s16 : MVE_VQDMULH_qr_m; +defm MVE_VQDMULH_qr_s32 : MVE_VQDMULH_qr_m; + +defm MVE_VQRDMULH_qr_s8 : MVE_VQRDMULH_qr_m; +defm MVE_VQRDMULH_qr_s16 : MVE_VQRDMULH_qr_m; +defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m; let Predicates = [HasMVEFloat], validForTailPredication = 1 in { def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>; def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>; } +defm : MVE_vec_scalar_fp_pat_m; + class MVE_VFMAMLA_qr bits_21_20, bit S, list pattern=[]> Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll @@ -91,3 +91,99 @@ declare <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2 +define arm_aapcs_vfpcc <4 x i32> @test_vaddq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vaddq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = add <4 x i32> %.splat, %a + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x half> @test_vaddq_n_f16(<8 x half> %a, float %b.coerce) { +; CHECK-LABEL: test_vaddq_n_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vadd.f16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %b.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %.splatinsert = insertelement <8 x half> undef, half %1, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %2 = fadd <8 x half> %.splat, %a + ret <8 x half> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vaddq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vaddq_m_n_f32(<4 x float> %inactive, <4 x float> %a, float %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddq_m_n_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.f32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x float> undef, float %b, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> %inactive) + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vaddq_x_n_u16(<8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddq_x_n_u16: +; CHECK: @ %bb.0: @ 
%entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vaddq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) { +; CHECK-LABEL: test_vaddq_x_n_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.f16 q0, q0, r1 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %b.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %.splatinsert = insertelement <8 x half> undef, half %1, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %2 = zext i16 %p to i32 + %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + %4 = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> undef) + ret <8 x half> %4 +} Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll @@ -133,3 +133,134 @@ ret <4 x i32> %2 } +define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_n_u8(<16 x i8> %a, i8 zeroext %b) { +; CHECK-LABEL: test_vhaddq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhadd.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> %a, <16 x i8> %.splat, i32 1) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vhaddq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhadd.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> %a, <8 x i16> %.splat, i32 0) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vhaddq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhadd.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %a, <4 x i32> %.splat, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> 
@llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_x_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 1, <16 x i1> %1, <16 x i8> undef) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_x_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 0, <8 x i1> %1, <8 x i16> undef) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_x_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> 
%.splat, i32 1, <4 x i1> %1, <4 x i32> undef) + ret <4 x i32> %2 +} Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll @@ -90,3 +90,135 @@ declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 + +define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_n_u8(<16 x i8> %a, i8 zeroext %b) { +; CHECK-LABEL: test_vhsubq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhsub.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> %a, <16 x i8> %.splat, i32 1) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vhsubq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhsub.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> %a, <8 x i16> %.splat, i32 0) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vhsubq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhsub.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %a, <4 x i32> %.splat, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhsubq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhsubt.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhsubq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhsubt.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhsubq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst 
+; CHECK-NEXT: vhsubt.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhsubq_x_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhsubt.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 1, <16 x i1> %1, <16 x i8> undef) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhsubq_x_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhsubt.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 0, <8 x i1> %1, <8 x i16> undef) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhsubq_x_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhsubt.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 1, <4 x i1> %1, <4 x i32> undef) + ret <4 x i32> %2 +} Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll @@ -169,3 +169,184 @@ declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2 +define arm_aapcs_vfpcc <16 x i8> @test_vmulq_n_u8(<16 x i8> %a, i8 zeroext %b) { +; CHECK-LABEL: test_vmulq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.i8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = mul <16 x i8> %.splat, %a + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vmulq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.i16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + 
%.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = mul <8 x i16> %.splat, %a + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vmulq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.i32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = mul <4 x i32> %.splat, %a + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) { +; CHECK-LABEL: test_vmulq_n_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmul.f32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x float> undef, float %b, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %0 = fmul <4 x float> %.splat, %a + ret <4 x float> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_n_f16(<8 x half> %inactive, <8 x half> %a, float %b.coerce, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_m_n_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.f16 q0, q1, r1 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %b.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to 
half + %.splatinsert = insertelement <8 x half> undef, half %1, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %2 = zext i16 %p to i32 + %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + %4 = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> %inactive) + ret <8 x half> %4 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_x_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> undef) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_x_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_x_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> undef) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vmulq_x_n_f32(<4 x float> %a, float %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_x_n_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.f32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x float> undef, float %b, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> undef) + ret <4 x float> %2 +} Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll @@ -90,3 +90,87 @@ declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2 + +define arm_aapcs_vfpcc <16 x 
i8> @test_vqaddq_n_u8(<16 x i8> %a, i8 zeroext %b) { +; CHECK-LABEL: test_vqaddq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqadd.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %.splat) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqaddq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vqaddq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqadd.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %.splat) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqaddq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vqaddq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqadd.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %.splat) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqaddq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqaddq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqaddt.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqaddq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqaddq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqaddt.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqaddq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqaddq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqaddt.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll 
=================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll @@ -90,3 +90,87 @@ declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmulhq_n_s8(<16 x i8> %a, i8 signext %b) { +; CHECK-LABEL: test_vqdmulhq_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmulh.s8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> @llvm.arm.mve.vqdmulh.v16i8(<16 x i8> %a, <16 x i8> %.splat) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vqdmulhq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmulh.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = call <8 x i16> @llvm.arm.mve.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %.splat) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vqdmulhq_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmulh.s32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %.splat) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmulhq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmulhq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmulht.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.qdmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhq_m_n_s16(<8 x i16> %inactive, <8 x i16> %a, i16 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmulhq_m_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmulht.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.qdmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmulhq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmulht.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, 
i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll @@ -90,3 +90,87 @@ declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmulhq_n_s8(<16 x i8> %a, i8 signext %b) { +; CHECK-LABEL: test_vqrdmulhq_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmulh.s8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> @llvm.arm.mve.vqrdmulh.v16i8(<16 x i8> %a, <16 x i8> %.splat) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vqrdmulhq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmulh.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = call <8 x i16> @llvm.arm.mve.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %.splat) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vqrdmulhq_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmulh.s32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %.splat) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmulhq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmulhq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmulht.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.qrdmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulhq_m_n_s16(<8 x i16> %inactive, <8 x i16> %a, i16 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmulhq_m_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmulht.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.qrdmulh.predicated.v8i16.v8i1(<8 x i16> 
%a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulhq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmulhq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmulht.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll @@ -90,3 +90,87 @@ declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2 + +define arm_aapcs_vfpcc <16 x i8> @test_vqsubq_n_u8(<16 x i8> %a, i8 zeroext %b) { +; CHECK-LABEL: test_vqsubq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqsub.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %.splat) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqsubq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vqsubq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqsub.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %.splat) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqsubq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vqsubq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqsub.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %.splat) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqsubq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqsubq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqsubt.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqsubq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqsubq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: 
vqsubt.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqsubq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqsubq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqsubt.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll @@ -91,3 +91,99 @@ declare <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2 +define arm_aapcs_vfpcc <4 x i32> @test_vsubq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vsubq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vsub.i32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = sub <4 x i32> %a, %.splat + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x half> @test_vsubq_n_f16(<8 x half> %a, float %b.coerce) { +; CHECK-LABEL: test_vsubq_n_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vsub.f16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %b.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %.splatinsert = insertelement <8 x half> undef, half %1, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %2 = fsub <8 x half> %a, %.splat + ret <8 x half> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vsubq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vsubq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsubt.i8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vsubq_m_n_f32(<4 x float> %inactive, <4 x float> %a, float %b, i16 zeroext %p) { +; CHECK-LABEL: test_vsubq_m_n_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsubt.f32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = 
insertelement <4 x float> undef, float %b, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> %inactive) + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vsubq_x_n_u16(<8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vsubq_x_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsubt.i16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vsubq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) { +; CHECK-LABEL: test_vsubq_x_n_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsubt.f16 q0, q0, r1 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %b.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %.splatinsert = insertelement <8 x half> undef, half %1, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %2 = zext i16 %p to i32 + %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + %4 = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> undef) + ret <8 x half> %4 +}
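For reference, here is a small C usage sketch (not part of the patch) of the new vector-scalar forms exercised by the tests above. It relies only on intrinsic signatures that already appear in the tests (vmulq_n_f32, vqaddq_m_n_s8, vsubq_x_n_u16); the wrapper function names are illustrative.

#include <arm_mve.h>

/* Multiply each lane of v by a scalar gain. With the new ARMInstrMVE.td
 * patterns this should select the q-register/r-register form of vmul.f32
 * rather than first splatting the scalar into a vector register. */
float32x4_t apply_gain(float32x4_t v, float gain)
{
    return vmulq_n_f32(v, gain);
}

/* Predicated saturating add of a scalar bias: lanes enabled by p compute
 * a + bias with saturation, disabled lanes are taken from inactive. */
int8x16_t bias_selected_lanes(int8x16_t inactive, int8x16_t a, int8_t bias,
                              mve_pred16_t p)
{
    return vqaddq_m_n_s8(inactive, a, bias, p);
}

/* The _x variant leaves disabled lanes undefined, so there is no
 * inactive operand. */
uint16x8_t sub_offset_x(uint16x8_t a, uint16_t offset, mve_pred16_t p)
{
    return vsubq_x_n_u16(a, offset, p);
}

Compiled for thumbv8.1m.main with +mve.fp (as in the RUN lines above), each wrapper should lower to the single vmul.f32 / vqaddt.s8 / vsubt.i16 instruction that the assembly tests check for.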