diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -63,9 +63,9 @@ let params = T.Int in { def vaddq: Intrinsic; def vhaddq: Intrinsic $a, $b)>; + (IRInt<"vhadd", [Vector]> $a, $b, (unsignedflag Scalar))>; def vrhaddq: Intrinsic $a, $b)>; + (IRInt<"vrhadd", [Vector]> $a, $b, (unsignedflag Scalar))>; def vandq: Intrinsic; def vbicq: Intrinsic; def veorq: Intrinsic; @@ -73,18 +73,18 @@ def vorrq: Intrinsic; def vsubq: Intrinsic; def vhsubq: Intrinsic $a, $b)>; + (IRInt<"vhsub", [Vector]> $a, $b, (unsignedflag Scalar))>; def vmulq: Intrinsic; def vmulhq: Intrinsic $a, $b)>; + (IRInt<"vmulh", [Vector]> $a, $b, (unsignedflag Scalar))>; def vrmulhq: Intrinsic $a, $b)>; + (IRInt<"vrmulh", [Vector]> $a, $b, (unsignedflag Scalar))>; def vmullbq_int: Intrinsic - $a, $b, 0)>; + $a, $b, (unsignedflag Scalar), 0)>; def vmulltq_int: Intrinsic - $a, $b, 1)>; + $a, $b, (unsignedflag Scalar), 1)>; } let params = T.Signed in { def vqdmulhq: Intrinsic $a, $b)>; + (IRInt<"vabd", [Vector]> $a, $b, (unsignedflag Scalar))>; } -multiclass VectorVectorArithmetic { - defm "" : IntrinsicMX $a, $b, - $pred, $inactive), - wantXVariant>; +multiclass VectorVectorArithmetic { + defm "" : IntrinsicMX< + Vector, (args Vector:$a, Vector:$b, Predicate:$pred), + !con((IRInt $a, $b), + extraArgs, (? $pred, $inactive)), wantXVariant>; } multiclass VectorVectorArithmeticBitcast { @@ -157,7 +157,7 @@ // Predicated intrinsics let params = T.Usual in { - defm vabdq : VectorVectorArithmetic<"abd_predicated">; + defm vabdq : VectorVectorArithmetic<"abd_predicated", (? (unsignedflag Scalar))>; defm vaddq : VectorVectorArithmetic<"add_predicated">; defm vsubq : VectorVectorArithmetic<"sub_predicated">; defm vmulq : VectorVectorArithmetic<"mul_predicated">; @@ -168,42 +168,41 @@ defm vorrq : VectorVectorArithmeticBitcast<"orr_predicated">; } -multiclass DblVectorVectorArithmetic { - defm "" : IntrinsicMX - $a, $b, top, $pred, $inactive)>; +multiclass DblVectorVectorArithmetic { + defm "" : IntrinsicMX< + DblVector, (args Vector:$a, Vector:$b, Predicate:$pred), + !con((IRInt $a, $b), + extraArgs, (? $pred, $inactive))>; } // Predicated intrinsics - Int types only let params = T.Int in { - defm vminq : VectorVectorArithmetic<"min_predicated">; - defm vmaxq : VectorVectorArithmetic<"max_predicated">; - defm vmulhq : VectorVectorArithmetic<"mulh_predicated">; - defm vrmulhq : VectorVectorArithmetic<"rmulh_predicated">; - defm vqaddq : VectorVectorArithmetic<"qadd_predicated", 0>; - defm vhaddq : VectorVectorArithmetic<"hadd_predicated">; - defm vrhaddq : VectorVectorArithmetic<"rhadd_predicated">; - defm vqsubq : VectorVectorArithmetic<"qsub_predicated", 0>; - defm vhsubq : VectorVectorArithmetic<"hsub_predicated">; - defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (u32 0)>; - defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (u32 1)>; + defm vminq : VectorVectorArithmetic<"min_predicated", (? (unsignedflag Scalar))>; + defm vmaxq : VectorVectorArithmetic<"max_predicated", (? (unsignedflag Scalar))>; + defm vmulhq : VectorVectorArithmetic<"mulh_predicated", (? (unsignedflag Scalar))>; + defm vrmulhq : VectorVectorArithmetic<"rmulh_predicated", (? (unsignedflag Scalar))>; + defm vqaddq : VectorVectorArithmetic<"qadd_predicated", (? (unsignedflag Scalar)), 0>; + defm vhaddq : VectorVectorArithmetic<"hadd_predicated", (? (unsignedflag Scalar))>; + defm vrhaddq : VectorVectorArithmetic<"rhadd_predicated", (? (unsignedflag Scalar))>; + defm vqsubq : VectorVectorArithmetic<"qsub_predicated", (? (unsignedflag Scalar)), 0>; + defm vhsubq : VectorVectorArithmetic<"hsub_predicated", (? (unsignedflag Scalar))>; + defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 0))>; + defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 1))>; } let params = T.Signed in { - defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated", 0>; - defm vqrdmulhq : VectorVectorArithmetic<"qrdmulh_predicated", 0>; + defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated", (?), 0>; + defm vqrdmulhq : VectorVectorArithmetic<"qrdmulh_predicated", (?), 0>; } let params = T.Poly, overrideKindLetter = "p" in { - defm vmullbq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (u32 0)>; - defm vmulltq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (u32 1)>; + defm vmullbq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (? (u32 0))>; + defm vmulltq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (? (u32 1))>; } // Predicated intrinsics - Float types only let params = T.Float in { - defm vminnmq : VectorVectorArithmetic<"min_predicated">; - defm vmaxnmq : VectorVectorArithmetic<"max_predicated">; + defm vminnmq : VectorVectorArithmetic<"min_predicated", (? (u32 0))>; + defm vmaxnmq : VectorVectorArithmetic<"max_predicated", (? (u32 0))>; } let params = T.Int in { diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c @@ -6,7 +6,7 @@ // CHECK-LABEL: @test_vabdq_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0) // CHECK-NEXT: ret <16 x i8> [[TMP0]] // int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) @@ -20,7 +20,7 @@ // CHECK-LABEL: @test_vabdq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1) // CHECK-NEXT: ret <4 x i32> [[TMP0]] // uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) @@ -34,7 +34,7 @@ // CHECK-LABEL: @test_vabdq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0) // CHECK-NEXT: ret <8 x half> [[TMP0]] // float16x8_t test_vabdq_f32(float16x8_t a, float16x8_t b) @@ -50,7 +50,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vabdq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -66,7 +66,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // int8x16_t test_vabdq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) @@ -82,7 +82,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x float> [[TMP2]] // float32x4_t test_vabdq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p) @@ -98,7 +98,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> undef) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vabdq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -114,7 +114,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // uint32x4_t test_vabdq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) @@ -130,7 +130,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> undef) // CHECK-NEXT: ret <8 x half> [[TMP2]] // float16x8_t test_vabdq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c @@ -6,7 +6,7 @@ // CHECK-LABEL: @test_vhaddq_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1) // CHECK-NEXT: ret <16 x i8> [[TMP0]] // uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) @@ -20,7 +20,7 @@ // CHECK-LABEL: @test_vhaddq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0) // CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) @@ -34,7 +34,7 @@ // CHECK-LABEL: @test_vhaddq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1) // CHECK-NEXT: ret <4 x i32> [[TMP0]] // uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) @@ -50,7 +50,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // int8x16_t test_vhaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) @@ -66,7 +66,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vhaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -82,7 +82,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) @@ -98,7 +98,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // uint8x16_t test_vhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) @@ -114,7 +114,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // int16x8_t test_vhaddq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) @@ -130,7 +130,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // uint32x4_t test_vhaddq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c @@ -6,7 +6,7 @@ // CHECK-LABEL: @test_vhsubq_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1) // CHECK-NEXT: ret <16 x i8> [[TMP0]] // uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) @@ -20,7 +20,7 @@ // CHECK-LABEL: @test_vhsubq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0) // CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) @@ -34,7 +34,7 @@ // CHECK-LABEL: @test_vhsubq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1) // CHECK-NEXT: ret <4 x i32> [[TMP0]] // uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) @@ -50,7 +50,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // int8x16_t test_vhsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) @@ -66,7 +66,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vhsubq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -82,7 +82,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vhsubq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c @@ -36,7 +36,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x half> [[TMP2]] // float16x8_t test_vmaxnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p) @@ -52,7 +52,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x float> [[TMP2]] // float32x4_t test_vmaxnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p) @@ -68,7 +68,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> undef) // CHECK-NEXT: ret <8 x half> [[TMP2]] // float16x8_t test_vmaxnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p) @@ -84,7 +84,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> undef) // CHECK-NEXT: ret <4 x float> [[TMP2]] // float32x4_t test_vmaxnmq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c @@ -53,7 +53,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // uint8x16_t test_vmaxq_m_u8(uint8x16_t inactive, uint8x16_t a, uint8x16_t b, mve_pred16_t p) @@ -69,7 +69,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // int16x8_t test_vmaxq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) @@ -85,7 +85,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // uint32x4_t test_vmaxq_m_u32(uint32x4_t inactive, uint32x4_t a, uint32x4_t b, mve_pred16_t p) @@ -101,15 +101,15 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // -int8x16_t test_vmaxq_x_u8(int8x16_t a, int8x16_t b, mve_pred16_t p) +uint8x16_t test_vmaxq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) { #ifdef POLYMORPHIC return vmaxq_x(a, b, p); #else /* POLYMORPHIC */ - return vmaxq_x_s8(a, b, p); + return vmaxq_x_u8(a, b, p); #endif /* POLYMORPHIC */ } @@ -117,7 +117,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> undef) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vmaxq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -133,7 +133,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> undef) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vmaxq_x_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) @@ -141,6 +141,6 @@ #ifdef POLYMORPHIC return vmaxq_x(a, b, p); #else /* POLYMORPHIC */ - return vmaxq_x_u32(a, b, p); + return vmaxq_x_s32(a, b, p); #endif /* POLYMORPHIC */ } diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c @@ -36,7 +36,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x half> [[TMP2]] // float16x8_t test_vminnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p) @@ -52,7 +52,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x float> [[TMP2]] // float32x4_t test_vminnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p) @@ -68,7 +68,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> undef) // CHECK-NEXT: ret <8 x half> [[TMP2]] // float16x8_t test_vminnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p) @@ -84,7 +84,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> undef) // CHECK-NEXT: ret <4 x float> [[TMP2]] // float32x4_t test_vminnmq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c @@ -53,7 +53,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // int8x16_t test_vminq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) @@ -69,7 +69,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vminq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -85,7 +85,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vminq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) @@ -101,7 +101,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // uint8x16_t test_vminq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) @@ -117,7 +117,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // int16x8_t test_vminq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) @@ -125,7 +125,7 @@ #ifdef POLYMORPHIC return vminq_x(a, b, p); #else /* POLYMORPHIC */ - return vminq_x_u16(a, b, p); + return vminq_x_s16(a, b, p); #endif /* POLYMORPHIC */ } @@ -133,7 +133,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // uint32x4_t test_vminq_x_s32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c @@ -6,7 +6,7 @@ // CHECK-LABEL: @test_vmulhq_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1) // CHECK-NEXT: ret <16 x i8> [[TMP0]] // uint8x16_t test_vmulhq_u8(uint8x16_t a, uint8x16_t b) @@ -20,7 +20,7 @@ // CHECK-LABEL: @test_vmulhq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0) // CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vmulhq_s16(int16x8_t a, int16x8_t b) @@ -34,7 +34,7 @@ // CHECK-LABEL: @test_vmulhq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1) // CHECK-NEXT: ret <4 x i32> [[TMP0]] // uint32x4_t test_vmulhq_u32(uint32x4_t a, uint32x4_t b) @@ -50,7 +50,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // int8x16_t test_vmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) @@ -66,7 +66,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -82,7 +82,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) @@ -98,7 +98,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // uint8x16_t test_vmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) @@ -114,7 +114,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // int16x8_t test_vmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) @@ -130,7 +130,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // uint32x4_t test_vmulhq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c @@ -6,7 +6,7 @@ // CHECK-LABEL: @test_vmullbq_int_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 0) // CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vmullbq_int_u8(uint8x16_t a, uint8x16_t b) @@ -20,7 +20,7 @@ // CHECK-LABEL: @test_vmullbq_int_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 0) // CHECK-NEXT: ret <4 x i32> [[TMP0]] // int32x4_t test_vmullbq_int_s16(int16x8_t a, int16x8_t b) @@ -34,7 +34,7 @@ // CHECK-LABEL: @test_vmullbq_int_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0) // CHECK-NEXT: ret <2 x i64> [[TMP0]] // uint64x2_t test_vmullbq_int_u32(uint32x4_t a, uint32x4_t b) @@ -64,7 +64,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 0, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // int16x8_t test_vmullbq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) @@ -80,7 +80,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 0, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // uint32x4_t test_vmullbq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -96,7 +96,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vmullbq_int_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) @@ -128,7 +128,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 0, <16 x i1> [[TMP1]], <8 x i16> undef) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vmullbq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) @@ -136,7 +136,7 @@ #ifdef POLYMORPHIC return vmullbq_int_x(a, b, p); #else /* POLYMORPHIC */ - return vmullbq_int_x_s8(a, b, p); + return vmullbq_int_x_u8(a, b, p); #endif /* POLYMORPHIC */ } @@ -144,7 +144,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 0, <8 x i1> [[TMP1]], <4 x i32> undef) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vmullbq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) @@ -152,7 +152,7 @@ #ifdef POLYMORPHIC return vmullbq_int_x(a, b, p); #else /* POLYMORPHIC */ - return vmullbq_int_x_u16(a, b, p); + return vmullbq_int_x_s16(a, b, p); #endif /* POLYMORPHIC */ } @@ -160,7 +160,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <2 x i64> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0, <4 x i1> [[TMP1]], <2 x i64> undef) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // uint64x2_t test_vmullbq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) @@ -168,7 +168,7 @@ #ifdef POLYMORPHIC return vmullbq_int_x(a, b, p); #else /* POLYMORPHIC */ - return vmullbq_int_x_s32(a, b, p); + return vmullbq_int_x_u32(a, b, p); #endif /* POLYMORPHIC */ } diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c @@ -6,7 +6,7 @@ // CHECK-LABEL: @test_vmulltq_int_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 1) // CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vmulltq_int_u8(uint8x16_t a, uint8x16_t b) @@ -20,7 +20,7 @@ // CHECK-LABEL: @test_vmulltq_int_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 1) // CHECK-NEXT: ret <4 x i32> [[TMP0]] // int32x4_t test_vmulltq_int_s16(int16x8_t a, int16x8_t b) @@ -34,7 +34,7 @@ // CHECK-LABEL: @test_vmulltq_int_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1) // CHECK-NEXT: ret <2 x i64> [[TMP0]] // uint64x2_t test_vmulltq_int_u32(uint32x4_t a, uint32x4_t b) @@ -64,7 +64,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 1, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // int16x8_t test_vmulltq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) @@ -80,7 +80,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 1, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // uint32x4_t test_vmulltq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -96,7 +96,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vmulltq_int_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) @@ -128,7 +128,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 1, <16 x i1> [[TMP1]], <8 x i16> undef) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vmulltq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) @@ -144,7 +144,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 1, <8 x i1> [[TMP1]], <4 x i32> undef) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vmulltq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) @@ -160,7 +160,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <2 x i64> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1, <4 x i1> [[TMP1]], <2 x i64> undef) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // uint64x2_t test_vmulltq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c @@ -50,7 +50,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // int8x16_t test_vqaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) @@ -66,7 +66,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vqaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -82,7 +82,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vqaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c @@ -50,7 +50,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // int8x16_t test_vqsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) @@ -66,7 +66,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vqsubq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -82,7 +82,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vqsubq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c @@ -6,7 +6,7 @@ // CHECK-LABEL: @test_vrhaddq_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1) // CHECK-NEXT: ret <16 x i8> [[TMP0]] // uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) @@ -20,7 +20,7 @@ // CHECK-LABEL: @test_vrhaddq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0) // CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) @@ -34,7 +34,7 @@ // CHECK-LABEL: @test_vrhaddq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1) // CHECK-NEXT: ret <4 x i32> [[TMP0]] // uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) @@ -50,7 +50,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // int8x16_t test_vrhaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) @@ -66,7 +66,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vrhaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -82,7 +82,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vrhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) @@ -98,7 +98,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // uint8x16_t test_vrhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) @@ -114,10 +114,10 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> undef) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // -int16x8_t test_vrhaddq_x_u16(int16x8_t a, int16x8_t b, mve_pred16_t p) +uint16x8_t test_vrhaddq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) { #ifdef POLYMORPHIC return vrhaddq_x(a, b, p); @@ -130,7 +130,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // uint32x4_t test_vrhaddq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c @@ -6,7 +6,7 @@ // CHECK-LABEL: @test_vrmulhq_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1) // CHECK-NEXT: ret <16 x i8> [[TMP0]] // uint8x16_t test_vrmulhq_u8(uint8x16_t a, uint8x16_t b) @@ -20,7 +20,7 @@ // CHECK-LABEL: @test_vrmulhq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0) // CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vrmulhq_s16(int16x8_t a, int16x8_t b) @@ -34,7 +34,7 @@ // CHECK-LABEL: @test_vrmulhq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1) // CHECK-NEXT: ret <4 x i32> [[TMP0]] // uint32x4_t test_vrmulhq_u32(uint32x4_t a, uint32x4_t b) @@ -50,7 +50,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // int8x16_t test_vrmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) @@ -66,7 +66,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vrmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) @@ -82,7 +82,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vrmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) @@ -98,7 +98,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) // CHECK-NEXT: ret <16 x i8> [[TMP2]] // uint8x16_t test_vrmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) @@ -114,7 +114,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef) // CHECK-NEXT: ret <8 x i16> [[TMP2]] // int16x8_t test_vrmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) @@ -130,7 +130,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // uint32x4_t test_vrmulhq_m_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -802,14 +802,16 @@ } def int_arm_mve_min_predicated: Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */, + llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_max_predicated: Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */, + llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_abd_predicated: Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */, + llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_add_predicated: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; @@ -835,40 +837,42 @@ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_mulh_predicated: Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */, + llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_qdmulh_predicated: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_rmulh_predicated: Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */, + llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_qrdmulh_predicated: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_mull_int_predicated: Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty, - LLVMMatchType<0>], + [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty /* unsigned */, + llvm_i32_ty /* top */, llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_mull_poly_predicated: Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_qadd_predicated: Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */, + llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_hadd_predicated: Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */, + llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_rhadd_predicated: Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */, + llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_qsub_predicated: Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */, + llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_hsub_predicated: Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */, + llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty], [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>; @@ -962,7 +966,8 @@ def int_arm_mve_vabd: Intrinsic< [llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */], + [IntrNoMem]>; def int_arm_mve_vadc: Intrinsic< [llvm_anyvector_ty, llvm_i32_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>; @@ -972,28 +977,34 @@ llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>; def int_arm_mve_vmulh: Intrinsic< [llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */], + [IntrNoMem]>; def int_arm_mve_vqdmulh: Intrinsic< [llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_vhadd: Intrinsic< [llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */], + [IntrNoMem]>; def int_arm_mve_vrhadd: Intrinsic< [llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */], + [IntrNoMem]>; def int_arm_mve_vhsub: Intrinsic< [llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */], + [IntrNoMem]>; def int_arm_mve_vrmulh: Intrinsic< [llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */], + [IntrNoMem]>; def int_arm_mve_vqrdmulh: Intrinsic< [llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_vmull: Intrinsic< [llvm_anyvector_ty], - [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>; + [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty /* unsigned */, + llvm_i32_ty /* top */], [IntrNoMem]>; def int_arm_mve_vmull_poly: Intrinsic< [llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -320,6 +320,9 @@ // The suffix used on an instruction that mentions the whole type. string Suffix = suffixletter ## BitsSuffix; + + // The letter part of the suffix only. + string SuffixLetter = suffixletter; } // Integer vector types that don't treat signed and unsigned differently. @@ -1090,12 +1093,12 @@ (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; def : Pat<(v8f16 (fmaxnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; - def : Pat<(v4f32 (int_arm_mve_max_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), + def : Pat<(v4f32 (int_arm_mve_max_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), (i32 0), (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))), (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), ARMVCCThen, (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>; - def : Pat<(v8f16 (int_arm_mve_max_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), + def : Pat<(v8f16 (int_arm_mve_max_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), (i32 0), (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))), (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), ARMVCCThen, (v8i1 VCCR:$mask), @@ -1111,12 +1114,12 @@ def : Pat<(v8f16 (fminnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; def : Pat<(v4f32 (int_arm_mve_min_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), - (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))), + (i32 0), (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))), (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), ARMVCCThen, (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>; def : Pat<(v8f16 (int_arm_mve_min_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), - (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))), + (i32 0), (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))), (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), ARMVCCThen, (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive)))>; @@ -1149,7 +1152,8 @@ // Predicated min/max def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), ARMVCCThen, (VTI.Pred VCCR:$mask), @@ -1771,14 +1775,15 @@ def "" : MVE_VQADD_; let Predicates = [HasMVEInt] in { - // Unpredicated add-and-divide-by-two + // Unpredicated saturating add def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; - // Predicated add-and-divide-by-two + // Predicated saturating add def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), ARMVCCThen, (VTI.Pred VCCR:$mask), @@ -1801,14 +1806,15 @@ def "" : MVE_VQSUB_; let Predicates = [HasMVEInt] in { - // Unpredicated subtract-and-divide-by-two + // Unpredicated saturating subtract def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; - // Predicated subtract-and-divide-by-two + // Predicated saturating subtract def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), ARMVCCThen, (VTI.Pred VCCR:$mask), @@ -1845,13 +1851,15 @@ let Predicates = [HasMVEInt] in { // Unpredicated absolute difference - def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; // Predicated absolute difference def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), ARMVCCThen, (VTI.Pred VCCR:$mask), @@ -1887,13 +1895,15 @@ let Predicates = [HasMVEInt] in { // Unpredicated rounding add-with-divide-by-two - def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; // Predicated add-with-divide-by-two def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), ARMVCCThen, (VTI.Pred VCCR:$mask), @@ -1939,12 +1949,12 @@ let Predicates = [HasMVEInt] in { // Unpredicated add-and-divide-by-two - def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; // Predicated add-and-divide-by-two - def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), @@ -1969,13 +1979,15 @@ let Predicates = [HasMVEInt] in { // Unpredicated subtract-and-divide-by-two - def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; // Predicated subtract-and-divide-by-two def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), ARMVCCThen, (VTI.Pred VCCR:$mask), @@ -3318,11 +3330,13 @@ def "" : MVE_VABD_fp; let Predicates = [HasMVEFloat] in { - def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 0))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (i32 0), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), ARMVCCThen, (VTI.Pred VCCR:$mask), @@ -3976,17 +3990,22 @@ def "" : MVE_VMULL<"vmull" # !if(Top, "t", "b"), VTI.Suffix, VTI.Unsigned, VTI.Size, Top, cstr>; + foreach uflag = [!if(!eq(VTI.SuffixLetter, "p"), + (?), (? (i32 VTI.Unsigned)))] in let Predicates = [HasMVEInt] in { + // Unpredicated multiply - def : Pat<(VTI.DblVec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (i32 Top))), + def : Pat<(VTI.DblVec !con((unpred_op (VTI.Vec MQPR:$Qm), + (VTI.Vec MQPR:$Qn)), + uflag, (? (i32 Top)))), (VTI.DblVec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; // Predicated multiply - def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (i32 Top), - (VTI.Pred VCCR:$mask), (VTI.DblVec MQPR:$inactive))), + def : Pat<(VTI.DblVec !con((pred_int (VTI.Vec MQPR:$Qm), + (VTI.Vec MQPR:$Qn)), + uflag, (? (i32 Top), (VTI.Pred VCCR:$mask), + (VTI.DblVec MQPR:$inactive)))), (VTI.DblVec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), ARMVCCThen, (VTI.Pred VCCR:$mask), @@ -4059,13 +4078,15 @@ let Predicates = [HasMVEInt] in { // Unpredicated multiply returning high bits - def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 VTI.Unsigned))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; // Predicated multiply returning high bits def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), ARMVCCThen, (VTI.Pred VCCR:$mask), diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll @@ -7,23 +7,23 @@ ; CHECK-NEXT: vabd.s8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> %a, <16 x i8> %b) + %0 = tail call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> %a, <16 x i8> %b, i32 0) ret <16 x i8> %0 } -declare <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8>, <16 x i8>, i32) #1 define arm_aapcs_vfpcc <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vabdq_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vabd.s32 q0, q0, q1 +; CHECK-NEXT: vabd.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> %a, <4 x i32> %b) + %0 = tail call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1) ret <4 x i32> %0 } -declare <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>, <4 x i32>, i32) #1 define arm_aapcs_vfpcc <8 x half> @test_vabdq_f32(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vabdq_f32: @@ -31,29 +31,29 @@ ; CHECK-NEXT: vabd.f16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> %a, <8 x half> %b) + %0 = tail call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> %a, <8 x half> %b, i32 0) ret <8 x half> %0 } -declare <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half>, <8 x half>) #1 +declare <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half>, <8 x half>, i32) #1 define arm_aapcs_vfpcc <8 x i16> @test_vabdq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vabdq_m_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vabdt.s16 q0, q1, q2 +; CHECK-NEXT: vabdt.u16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1 -declare <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1 define arm_aapcs_vfpcc <16 x i8> @test_vabdq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vabdq_m_s8: @@ -65,13 +65,13 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + %2 = tail call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive) ret <16 x i8> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1 -declare <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1 define arm_aapcs_vfpcc <4 x float> @test_vabdq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vabdq_m_f32: @@ -83,25 +83,25 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive) + %2 = tail call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> %inactive) ret <4 x float> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 -declare <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1 +declare <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float>, <4 x float>, i32, <4 x i1>, <4 x float>) #1 define arm_aapcs_vfpcc <8 x i16> @test_vabdq_x_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vabdq_x_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vabdt.s16 q0, q0, q1 +; CHECK-NEXT: vabdt.u16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef) + %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> undef) ret <8 x i16> %2 } @@ -110,16 +110,16 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vabdt.s32 q0, q0, q1 +; CHECK-NEXT: vabdt.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef) + %2 = tail call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef) ret <4 x i32> %2 } -declare <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 define arm_aapcs_vfpcc <8 x half> @test_vabdq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vabdq_x_f16: @@ -131,9 +131,9 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef) + %2 = tail call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> undef) ret <8 x half> %2 } -declare <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #1 +declare <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half>, <8 x half>, i32, <8 x i1>, <8 x half>) #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll @@ -4,14 +4,14 @@ define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhaddq_u8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhadd.s8 q0, q0, q1 +; CHECK-NEXT: vhadd.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> %a, <16 x i8> %b) + %0 = tail call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) ret <16 x i8> %0 } -declare <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8>, <16 x i8>, i32) #1 define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhaddq_s16: @@ -19,23 +19,23 @@ ; CHECK-NEXT: vhadd.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> %a, <8 x i16> %b) + %0 = tail call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0) ret <8 x i16> %0 } -declare <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16>, <8 x i16>, i32) #1 define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhaddq_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhadd.s32 q0, q0, q1 +; CHECK-NEXT: vhadd.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %a, <4 x i32> %b) + %0 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1) ret <4 x i32> %0 } -declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>, i32) #1 define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhaddq_m_s8: @@ -47,31 +47,31 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive) ret <16 x i8> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1 -declare <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1 define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhaddq_m_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vhaddt.s16 q0, q1, q2 +; CHECK-NEXT: vhaddt.u16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1 -declare <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1 define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhaddq_m_s32: @@ -83,25 +83,25 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive) ret <4 x i32> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 -declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhaddq_x_u8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vhaddt.s8 q0, q0, q1 +; CHECK-NEXT: vhaddt.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef) + %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef) ret <16 x i8> %2 } @@ -115,7 +115,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef) + %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef) ret <8 x i16> %2 } @@ -124,12 +124,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vhaddt.s32 q0, q0, q1 +; CHECK-NEXT: vhaddt.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef) + %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef) ret <4 x i32> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll @@ -4,14 +4,14 @@ define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhsubq_u8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhsub.s8 q0, q0, q1 +; CHECK-NEXT: vhsub.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> %a, <16 x i8> %b) + %0 = tail call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) ret <16 x i8> %0 } -declare <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8>, <16 x i8>, i32) #1 define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhsubq_s16: @@ -19,23 +19,23 @@ ; CHECK-NEXT: vhsub.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> %a, <8 x i16> %b) + %0 = tail call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0) ret <8 x i16> %0 } -declare <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16>, <8 x i16>, i32) #1 define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhsubq_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhsub.s32 q0, q0, q1 +; CHECK-NEXT: vhsub.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %a, <4 x i32> %b) + %0 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1) ret <4 x i32> %0 } -declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>, i32) #1 define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhsubq_m_s8: @@ -47,31 +47,31 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + %2 = tail call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive) ret <16 x i8> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1 -declare <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1 define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhsubq_m_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vhsubt.s16 q0, q1, q2 +; CHECK-NEXT: vhsubt.u16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1 -declare <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1 define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vhsubq_m_s32: @@ -83,10 +83,10 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + %2 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive) ret <4 x i32> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 -declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll @@ -35,13 +35,13 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive) + %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> %inactive) ret <8 x half> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 -declare <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2 +declare <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half>, <8 x half>, i32, <8 x i1>, <8 x half>) #2 define arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmaxnmq_m_f32: @@ -53,13 +53,13 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive) + %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> %inactive) ret <4 x float> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 -declare <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2 +declare <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float>, <4 x float>, i32, <4 x i1>, <4 x float>) #2 define arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmaxnmq_x_f16: @@ -71,7 +71,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef) + %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> undef) ret <8 x half> %2 } @@ -85,7 +85,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef) + %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> undef) ret <4 x float> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll @@ -39,18 +39,18 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmaxt.s8 q0, q1, q2 +; CHECK-NEXT: vmaxt.u8 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> %inactive) ret <16 x i8> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2 -declare <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2 +declare <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2 define arm_aapcs_vfpcc <8 x i16> @test_vmaxq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 { ; CHECK-LABEL: test_vmaxq_m_s16: @@ -62,43 +62,43 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 -declare <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2 +declare <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2 define arm_aapcs_vfpcc <4 x i32> @test_vmaxq_m_u32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 { ; CHECK-LABEL: test_vmaxq_m_u32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmaxt.s32 q0, q1, q2 +; CHECK-NEXT: vmaxt.u32 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> %inactive) ret <4 x i32> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 -declare <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2 +declare <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2 define arm_aapcs_vfpcc <16 x i8> @test_vmaxq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 { ; CHECK-LABEL: test_vmaxq_x_u8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmaxt.s8 q0, q0, q1 +; CHECK-NEXT: vmaxt.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef) + %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef) ret <16 x i8> %2 } @@ -112,7 +112,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef) + %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef) ret <8 x i16> %2 } @@ -121,12 +121,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmaxt.s32 q0, q0, q1 +; CHECK-NEXT: vmaxt.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef) + %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef) ret <4 x i32> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll @@ -35,13 +35,13 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive) + %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> %inactive) ret <8 x half> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 -declare <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2 +declare <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half>, <8 x half>, i32, <8 x i1>, <8 x half>) #2 define arm_aapcs_vfpcc <4 x float> @test_vminnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vminnmq_m_f32: @@ -53,13 +53,13 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive) + %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> %inactive) ret <4 x float> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 -declare <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2 +declare <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float>, <4 x float>, i32, <4 x i1>, <4 x float>) #2 define arm_aapcs_vfpcc <8 x half> @test_vminnmq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vminnmq_x_f16: @@ -71,7 +71,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef) + %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> undef) ret <8 x half> %2 } @@ -85,7 +85,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef) + %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> undef) ret <4 x float> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll @@ -44,31 +44,31 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive) ret <16 x i8> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2 -declare <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2 +declare <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2 define arm_aapcs_vfpcc <8 x i16> @test_vminq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 { ; CHECK-LABEL: test_vminq_m_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmint.s16 q0, q1, q2 +; CHECK-NEXT: vmint.u16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 -declare <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2 +declare <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2 define arm_aapcs_vfpcc <4 x i32> @test_vminq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 { ; CHECK-LABEL: test_vminq_m_s32: @@ -80,25 +80,25 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive) ret <4 x i32> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 -declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2 +declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2 define arm_aapcs_vfpcc <16 x i8> @test_vminq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 { ; CHECK-LABEL: test_vminq_x_u8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmint.s8 q0, q0, q1 +; CHECK-NEXT: vmint.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef) + %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef) ret <16 x i8> %2 } @@ -112,7 +112,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef) + %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef) ret <8 x i16> %2 } @@ -126,7 +126,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef) + %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> undef) ret <4 x i32> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll @@ -4,14 +4,14 @@ define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulhq_u8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmulh.s8 q0, q0, q1 +; CHECK-NEXT: vmulh.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> %a, <16 x i8> %b) + %0 = tail call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) ret <16 x i8> %0 } -declare <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8>, <16 x i8>, i32) #1 define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulhq_s16: @@ -19,23 +19,23 @@ ; CHECK-NEXT: vmulh.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> %a, <8 x i16> %b) + %0 = tail call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0) ret <8 x i16> %0 } -declare <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16>, <8 x i16>, i32) #1 define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulhq_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmulh.s32 q0, q0, q1 +; CHECK-NEXT: vmulh.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> %a, <4 x i32> %b) + %0 = tail call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1) ret <4 x i32> %0 } -declare <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32>, <4 x i32>, i32) #1 define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulhq_m_s8: @@ -47,31 +47,31 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive) ret <16 x i8> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1 -declare <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1 define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulhq_m_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmulht.s16 q0, q1, q2 +; CHECK-NEXT: vmulht.u16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1 -declare <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1 define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulhq_m_s32: @@ -83,25 +83,25 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive) ret <4 x i32> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 -declare <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulhq_x_u8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmulht.s8 q0, q0, q1 +; CHECK-NEXT: vmulht.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef) + %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef) ret <16 x i8> %2 } @@ -115,7 +115,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef) + %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef) ret <8 x i16> %2 } @@ -124,12 +124,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmulht.s32 q0, q0, q1 +; CHECK-NEXT: vmulht.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef) + %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef) ret <4 x i32> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll @@ -4,14 +4,14 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_int_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmullbq_int_u8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmullb.s8 q0, q0, q1 +; CHECK-NEXT: vmullb.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 0) + %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1, i32 0) ret <8 x i16> %0 } -declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32) #1 +declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32, i32) #1 define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmullbq_int_s16: @@ -19,24 +19,24 @@ ; CHECK-NEXT: vmullb.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0) + %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0, i32 0) ret <4 x i32> %0 } -declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32) #1 +declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32) #1 define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmullbq_int_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmullb.s32 q2, q0, q1 +; CHECK-NEXT: vmullb.u32 q2, q0, q1 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: - %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0) + %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1, i32 0) ret <2 x i64> %0 } -declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1 +declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32, i32) #1 define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_poly_p16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmullbq_poly_p16: @@ -60,31 +60,31 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, i32 0, <16 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1 -declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, i32, <16 x i1>, <8 x i16>) #1 define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_m_u16(<4 x i32> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmullbq_int_m_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmullbt.s16 q0, q1, q2 +; CHECK-NEXT: vmullbt.u16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <4 x i32> %inactive) + %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, i32 0, <8 x i1> %1, <4 x i32> %inactive) ret <4 x i32> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1 -declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, i32, <8 x i1>, <4 x i32>) #1 define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmullbq_int_m_s32: @@ -96,13 +96,13 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <2 x i64> %inactive) + %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, i32 0, <4 x i1> %1, <2 x i64> %inactive) ret <2 x i64> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 -declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <2 x i64>) #1 +declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, i32, <4 x i1>, <2 x i64>) #1 define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_poly_m_p8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmullbq_poly_m_p8: @@ -125,12 +125,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmullbt.s8 q0, q0, q1 +; CHECK-NEXT: vmullbt.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <8 x i16> undef) + %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, i32 0, <16 x i1> %1, <8 x i16> undef) ret <8 x i16> %2 } @@ -144,7 +144,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <4 x i32> undef) + %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, i32 0, <8 x i1> %1, <4 x i32> undef) ret <4 x i32> %2 } @@ -153,13 +153,13 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmullbt.s32 q2, q0, q1 +; CHECK-NEXT: vmullbt.u32 q2, q0, q1 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <2 x i64> undef) + %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, i32 0, <4 x i1> %1, <2 x i64> undef) ret <2 x i64> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll @@ -4,14 +4,14 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_int_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulltq_int_u8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmullt.s8 q0, q0, q1 +; CHECK-NEXT: vmullt.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) + %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1, i32 1) ret <8 x i16> %0 } -declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32) #1 +declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32, i32) #1 define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulltq_int_s16: @@ -19,24 +19,24 @@ ; CHECK-NEXT: vmullt.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 1) + %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0, i32 1) ret <4 x i32> %0 } -declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32) #1 +declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32) #1 define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulltq_int_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmullt.s32 q2, q0, q1 +; CHECK-NEXT: vmullt.u32 q2, q0, q1 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: - %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1) + %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1, i32 1) ret <2 x i64> %0 } -declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1 +declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32, i32) #1 define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_poly_p16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulltq_poly_p16: @@ -60,31 +60,31 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, i32 1, <16 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1 -declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, i32, <16 x i1>, <8 x i16>) #1 define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_m_u16(<4 x i32> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulltq_int_m_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmulltt.s16 q0, q1, q2 +; CHECK-NEXT: vmulltt.u16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <4 x i32> %inactive) + %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, i32 1, <8 x i1> %1, <4 x i32> %inactive) ret <4 x i32> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1 -declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, i32, <8 x i1>, <4 x i32>) #1 define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulltq_int_m_s32: @@ -96,13 +96,13 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <2 x i64> %inactive) + %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, i32 1, <4 x i1> %1, <2 x i64> %inactive) ret <2 x i64> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 -declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <2 x i64>) #1 +declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, i32, <4 x i1>, <2 x i64>) #1 define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_poly_m_p8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulltq_poly_m_p8: @@ -125,12 +125,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmulltt.s8 q0, q0, q1 +; CHECK-NEXT: vmulltt.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <8 x i16> undef) + %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, i32 1, <16 x i1> %1, <8 x i16> undef) ret <8 x i16> %2 } @@ -144,7 +144,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <4 x i32> undef) + %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, i32 1, <8 x i1> %1, <4 x i32> undef) ret <4 x i32> %2 } @@ -153,13 +153,13 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmulltt.s32 q2, q0, q1 +; CHECK-NEXT: vmulltt.u32 q2, q0, q1 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <2 x i64> undef) + %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, i32 1, <4 x i1> %1, <2 x i64> undef) ret <2 x i64> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll @@ -47,31 +47,31 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + %2 = tail call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive) ret <16 x i8> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2 -declare <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2 +declare <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2 define arm_aapcs_vfpcc <8 x i16> @test_vqaddq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vqaddq_m_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vqaddt.s16 q0, q1, q2 +; CHECK-NEXT: vqaddt.u16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 -declare <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2 +declare <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2 define arm_aapcs_vfpcc <4 x i32> @test_vqaddq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vqaddq_m_s32: @@ -83,10 +83,10 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + %2 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive) ret <4 x i32> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 -declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2 +declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll @@ -47,31 +47,31 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + %2 = tail call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive) ret <16 x i8> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2 -declare <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2 +declare <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2 define arm_aapcs_vfpcc <8 x i16> @test_vqsubq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vqsubq_m_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vqsubt.s16 q0, q1, q2 +; CHECK-NEXT: vqsubt.u16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 -declare <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2 +declare <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2 define arm_aapcs_vfpcc <4 x i32> @test_vqsubq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vqsubq_m_s32: @@ -83,10 +83,10 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + %2 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive) ret <4 x i32> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 -declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2 +declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll @@ -4,14 +4,14 @@ define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrhaddq_u8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vrhadd.s8 q0, q0, q1 +; CHECK-NEXT: vrhadd.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> %a, <16 x i8> %b) + %0 = tail call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) ret <16 x i8> %0 } -declare <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8>, <16 x i8>, i32) #1 define arm_aapcs_vfpcc <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrhaddq_s16: @@ -19,23 +19,23 @@ ; CHECK-NEXT: vrhadd.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> %a, <8 x i16> %b) + %0 = tail call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0) ret <8 x i16> %0 } -declare <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16>, <8 x i16>, i32) #1 define arm_aapcs_vfpcc <4 x i32> @test_vrhaddq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrhaddq_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vrhadd.s32 q0, q0, q1 +; CHECK-NEXT: vrhadd.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> %a, <4 x i32> %b) + %0 = tail call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1) ret <4 x i32> %0 } -declare <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32>, <4 x i32>, i32) #1 define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrhaddq_m_s8: @@ -47,31 +47,31 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive) ret <16 x i8> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1 -declare <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1 define arm_aapcs_vfpcc <8 x i16> @test_vrhaddq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrhaddq_m_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vrhaddt.s16 q0, q1, q2 +; CHECK-NEXT: vrhaddt.u16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1 -declare <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1 define arm_aapcs_vfpcc <4 x i32> @test_vrhaddq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrhaddq_m_s32: @@ -83,25 +83,25 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive) ret <4 x i32> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 -declare <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrhaddq_x_u8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vrhaddt.s8 q0, q0, q1 +; CHECK-NEXT: vrhaddt.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef) + %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef) ret <16 x i8> %2 } @@ -110,12 +110,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vrhaddt.s16 q0, q0, q1 +; CHECK-NEXT: vrhaddt.u16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef) + %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> undef) ret <8 x i16> %2 } @@ -124,12 +124,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vrhaddt.s32 q0, q0, q1 +; CHECK-NEXT: vrhaddt.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef) + %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef) ret <4 x i32> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll @@ -4,14 +4,14 @@ define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrmulhq_u8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vrmulh.s8 q0, q0, q1 +; CHECK-NEXT: vrmulh.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> %a, <16 x i8> %b) + %0 = tail call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) ret <16 x i8> %0 } -declare <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8>, <16 x i8>, i32) #1 define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrmulhq_s16: @@ -19,23 +19,23 @@ ; CHECK-NEXT: vrmulh.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> %a, <8 x i16> %b) + %0 = tail call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0) ret <8 x i16> %0 } -declare <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16>, <8 x i16>, i32) #1 define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrmulhq_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vrmulh.s32 q0, q0, q1 +; CHECK-NEXT: vrmulh.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> %a, <4 x i32> %b) + %0 = tail call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1) ret <4 x i32> %0 } -declare <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32>, <4 x i32>, i32) #1 define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrmulhq_m_s8: @@ -47,31 +47,31 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive) ret <16 x i8> %2 } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1 -declare <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1 define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrmulhq_m_u16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vrmulht.s16 q0, q1, q2 +; CHECK-NEXT: vrmulht.u16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive) ret <8 x i16> %2 } declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1 -declare <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1 +declare <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1 define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrmulhq_m_s32: @@ -83,25 +83,25 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive) ret <4 x i32> %2 } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 -declare <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vrmulhq_x_u8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vrmulht.s8 q0, q0, q1 +; CHECK-NEXT: vrmulht.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) - %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef) + %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef) ret <16 x i8> %2 } @@ -115,7 +115,7 @@ entry: %0 = zext i16 %p to i32 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) - %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef) + %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef) ret <8 x i16> %2 } @@ -124,12 +124,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vrmulht.s32 q0, q0, q1 +; CHECK-NEXT: vrmulht.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef) + %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef) ret <4 x i32> %2 }