diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -214,6 +214,32 @@ (IRIntBase<"maxnum", [Vector]> $a, $b)>; } +def vpselq: Intrinsic<Vector, (args Vector:$t, Vector:$f, Predicate:$pred), + (select $pred, $t, $f)> { let params = T.Usual; } +def vpselq_64: Intrinsic< + Vector, (args Vector:$t, Vector:$f, PredOf<u32>:$pred), + (bitcast (select $pred, (bitcast $t, VecOf<u32>), + (bitcast $f, VecOf<u32>)), Vector)>, + NameOverride<"vpselq"> { let params = T.All64; } + +let params = [Void], pnt = PNT_None in { + + multiclass vctp<Type pred, string intname> { + def "": Intrinsic<pred, (args u32:$val), + (u16 (IRInt<"pred_v2i", [pred]> (IRIntBase<intname> $val)))>; + def _m: Intrinsic<pred, (args u32:$val, pred:$inpred), + (u16 (IRInt<"pred_v2i", [pred]> (and $inpred, + (IRIntBase<intname> $val))))>; + } + defm vctp8q: vctp<PredOf<u8>, "arm_mve_vctp8">; + defm vctp16q: vctp<PredOf<u16>, "arm_mve_vctp16">; + defm vctp32q: vctp<PredOf<u32>, "arm_mve_vctp32">; + defm vctp64q: vctp<PredOf<u64>, "arm_mve_vctp64">; + + def vpnot: Intrinsic<PredOf<u8>, (args unpromoted<PredOf<u8>>:$pred), + (xor $pred, (u16 65535))>; + +} multiclass contiguous_load<string mnemonic, PrimitiveType memtype, list<Type> same_size, list<Type> wider> { diff --git a/clang/test/CodeGen/arm-mve-intrinsics/predicates.c b/clang/test/CodeGen/arm-mve-intrinsics/predicates.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/predicates.c @@ -0,0 +1,290 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s + +#include <arm_mve.h> + +// CHECK-LABEL: @test_vctp16q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]]) +// 
CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp16q(uint32_t a) +{ + return vctp16q(a); +} + +// CHECK-LABEL: @test_vctp16q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp16q_m(uint32_t a, mve_pred16_t p) +{ + return vctp16q_m(a, p); +} + +// CHECK-LABEL: @test_vctp32q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp32q(uint32_t a) +{ + return vctp32q(a); +} + +// CHECK-LABEL: @test_vctp32q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp32q_m(uint32_t a, mve_pred16_t p) +{ + return vctp32q_m(a, p); +} + +// CHECK-LABEL: @test_vctp64q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]]) 
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp64q(uint32_t a) +{ + return vctp64q(a); +} + +// CHECK-LABEL: @test_vctp64q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp64q_m(uint32_t a, mve_pred16_t p) +{ + return vctp64q_m(a, p); +} + +// CHECK-LABEL: @test_vctp8q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp8q(uint32_t a) +{ + return vctp8q(a); +} + +// CHECK-LABEL: @test_vctp8q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <16 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp8q_m(uint32_t a, mve_pred16_t p) +{ + return vctp8q_m(a, p); +} + +// CHECK-LABEL: @test_vpnot( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = xor i16 [[A:%.*]], -1 +// CHECK-NEXT: ret i16 [[TMP0]] +// +mve_pred16_t test_vpnot(mve_pred16_t a) +{ + return vpnot(a); +} + +// 
CHECK-LABEL: @test_vpselq_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x half> [[A:%.*]], <8 x half> [[B:%.*]] +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vpselq_f16(float16x8_t a, float16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_f16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]] +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vpselq_f32(float32x4_t a, float32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_f32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vpselq_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP2]] 
+// +int32x4_t test_vpselq_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP5]] +// +int64x2_t test_vpselq_s64(int64x2_t a, int64x2_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s64(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]] +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vpselq_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vpselq_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* 
POLYMORPHIC */ + return vpselq_u16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vpselq_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_u32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP5]] +// +uint64x2_t test_vpselq_u64(uint64x2_t a, uint64x2_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_u64(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]] +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vpselq_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + diff --git a/clang/utils/TableGen/MveEmitter.cpp 
b/clang/utils/TableGen/MveEmitter.cpp --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -1208,14 +1208,16 @@ Result::Ptr V = std::make_shared<BuiltinArgResult>(ArgNum, isa<PointerType>(ArgType)); - if (const auto *ST = dyn_cast<ScalarType>(ArgType)) { - if (Promote && ST->isInteger() && ST->sizeInBits() < 32) + if (Promote) { + if (const auto *ST = dyn_cast<ScalarType>(ArgType)) { + if (ST->isInteger() && ST->sizeInBits() < 32) + V = std::make_shared<IntCastResult>(getScalarType("u32"), V); + } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) { V = std::make_shared<IntCastResult>(getScalarType("u32"), V); - } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) { - V = std::make_shared<IntCastResult>(getScalarType("u32"), V); - V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v", - std::vector<const Type *>{PT}, - std::vector<Result::Ptr>{V}); + V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v", + std::vector<const Type *>{PT}, + std::vector<Result::Ptr>{V}); + } } return V; diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -4267,7 +4267,7 @@ def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; let hasSideEffects = 1 in -class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]> +class MVE_VCTPInst<string suffix, bits<2> size, list<dag> pattern=[]> : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix, "$Rn", vpred_n, "", pattern> { bits<4> Rn; @@ -4285,20 +4285,22 @@ let validForTailPredication = 1; } -def MVE_VCTP8 : MVE_VCTP<"8", 0b00>; -def MVE_VCTP16 : MVE_VCTP<"16", 0b01>; -def MVE_VCTP32 : MVE_VCTP<"32", 0b10>; -def MVE_VCTP64 : MVE_VCTP<"64", 0b11>; +multiclass MVE_VCTP<MVEVectorVTInfo VTI, Intrinsic intr> { + def "": MVE_VCTPInst<VTI.BitsSuffix, VTI.Size>; -let Predicates = [HasMVEInt] in { - def : Pat<(int_arm_mve_vctp8 rGPR:$Rn), - (v16i1 (MVE_VCTP8 rGPR:$Rn))>; - def : Pat<(int_arm_mve_vctp16 rGPR:$Rn), - (v8i1 (MVE_VCTP16 rGPR:$Rn))>; - def : Pat<(int_arm_mve_vctp32 rGPR:$Rn), - (v4i1 (MVE_VCTP32 rGPR:$Rn))>; + let Predicates = [HasMVEInt] in { + def : Pat<(intr rGPR:$Rn), + (VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn))>; + def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)), + 
(VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn, 1, VCCR:$mask))>; + } } +defm MVE_VCTP8 : MVE_VCTP<MVE_v16i8, int_arm_mve_vctp8>; +defm MVE_VCTP16 : MVE_VCTP<MVE_v8i16, int_arm_mve_vctp16>; +defm MVE_VCTP32 : MVE_VCTP<MVE_v4i32, int_arm_mve_vctp32>; +defm MVE_VCTP64 : MVE_VCTP<MVE_v2i64, int_arm_mve_vctp64>; + // end of mve_qDest_rSrc // start of coproc mov diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll @@ -0,0 +1,219 @@ +; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - | FileCheck %s + +declare <16 x i1> @llvm.arm.mve.vctp8(i32) +declare <8 x i1> @llvm.arm.mve.vctp16(i32) +declare <4 x i1> @llvm.arm.mve.vctp32(i32) +declare <4 x i1> @llvm.arm.mve.vctp64(i32) + +declare i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1>) +declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>) +declare i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1>) + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) + +define arm_aapcs_vfpcc zeroext i16 @test_vctp8q(i32 %a) { +; CHECK-LABEL: test_vctp8q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.8 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp8q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp8q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.8 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %a) + %3 = and <16 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc zeroext i16 
@test_vctp16q(i32 %a) { +; CHECK-LABEL: test_vctp16q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.16 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp16q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp16q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.16 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %a) + %3 = and <8 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp32q(i32 %a) { +; CHECK-LABEL: test_vctp32q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp32q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp32q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.32 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %a) + %3 = and <4 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp64q(i32 %a) { +; CHECK-LABEL: test_vctp64q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.64 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a) + %1 = call i32 
@llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp64q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp64q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.64 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a) + %3 = and <4 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vpselq_i8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = select <16 x i1> %1, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vpselq_i16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = select <8 x i1> %1, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vpselq_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = select <8 x i1> %1, <8 x half> %a, <8 x half> %b + ret <8 x half> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vpselq_i32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i32: +; CHECK: @ 
%bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = select <4 x i1> %1, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vpselq_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = select <4 x i1> %1, <4 x float> %a, <4 x float> %b + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vpselq_i64(<2 x i64> %a, <2 x i64> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = bitcast <2 x i64> %a to <4 x i32> + %3 = bitcast <2 x i64> %b to <4 x i32> + %4 = select <4 x i1> %1, <4 x i32> %2, <4 x i32> %3 + %5 = bitcast <4 x i32> %4 to <2 x i64> + ret <2 x i64> %5 +}