diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -427,6 +427,14 @@
     (extend (unzip $a, 1), DblVector, (unsignedflag Scalar))>;
 }
 
+let params = [s16, u16, s32, u32] in {
+  def vmovnbq: Intrinsic<HalfVector, (args HalfVector:$a, Vector:$b),
+      (trunc (zip $b, (vreinterpret (vrev $a, (bitsize Scalar)), Vector)),
+             HalfVector)>;
+  def vmovntq: Intrinsic<HalfVector, (args HalfVector:$a, Vector:$b),
+      (trunc (zip (vreinterpret $a, Vector), $b), HalfVector)>;
+}
+
 let params = T.Float in {
 def vrndq: Intrinsic<Vector, (args Vector:$a),
                      (IRIntBase<"trunc", [Vector]> $a)>;
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -131,6 +131,7 @@
 def unzip: CGHelperFn<"VectorUnzip"> {
   let special_params = [IRBuilderIntParam<1, "bool">];
 }
+def zip: CGHelperFn<"VectorZip">;
 
 // Helper for making boolean flags in IR
 def i1: IRBuilderBase {
@@ -187,6 +188,10 @@
 // and 0 for a signed (or floating) one.
 def unsignedflag;
 
+// 'bitsize' also takes a scalar type, and expands into an integer
+// constant giving its size in bits.
+def bitsize;
+
 // If you put CustomCodegen<"foo"> in an intrinsic's codegen field, it
 // indicates that the IR generation for that intrinsic is done by handwritten
 // C++ and not autogenerated at all. The effect in the MVE builtin codegen
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -7067,6 +7067,19 @@
                                      Indices);
 }
 
+static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
+                              llvm::Value *V1) {
+  // Make a shufflevector that interleaves two vectors element by element.
+  assert(V0->getType() == V1->getType() && "Can't zip different vector types");
+  SmallVector<uint32_t, 16> Indices;
+  unsigned InputElements = V0->getType()->getVectorNumElements();
+  for (unsigned i = 0; i < InputElements; i++) {
+    Indices.push_back(i);
+    Indices.push_back(i + InputElements);
+  }
+  return Builder.CreateShuffleVector(V0, V1, Indices);
+}
+
 template<unsigned HighBit, unsigned OtherBits>
 static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
   // MVE-specific helper function to make a vector splat of a constant such as
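For readers unfamiliar with zip shuffles, the index pattern VectorZip builds can be modelled in plain C. This is only an illustrative sketch, not part of the patch; the 8-lane width and the printing harness are assumptions chosen for the demo:

```c
#include <stdio.h>

/* Standalone model of the shuffle mask VectorZip emits: for two
 * n-element vectors it interleaves lanes as 0, n, 1, n+1, ...
 * The 8-lane width here is an illustrative assumption only. */
int main(void) {
    enum { N = 8 };
    unsigned indices[2 * N];
    for (unsigned i = 0; i < N; i++) {
        indices[2 * i] = i;         /* lane i of the first vector  */
        indices[2 * i + 1] = i + N; /* lane i of the second vector */
    }
    for (unsigned i = 0; i < 2 * N; i++)
        printf("%u ", indices[i]);  /* prints: 0 8 1 9 2 10 ... 7 15 */
    printf("\n");
    return 0;
}
```

For two `<8 x i16>` operands this is exactly the `<16 x i32>` shufflevector mask that appears throughout the tests below.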
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmovn.c b/clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
@@ -0,0 +1,199 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=LE %s
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=BE %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=LE %s
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=BE %s
+
+#include <arm_mve.h>
+
+// LE-LABEL: @test_vmovnbq_s16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_s16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// BE-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+int8x16_t test_vmovnbq_s16(int8x16_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovnbq_s32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_s32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// BE-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+int16x8_t test_vmovnbq_s32(int16x8_t a, int32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovnbq_u16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_u16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// BE-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+uint8x16_t test_vmovnbq_u16(uint8x16_t a, uint16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_u16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovnbq_u32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_u32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// BE-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+uint16x8_t test_vmovnbq_u32(uint16x8_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_s16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x i16>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_s16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vmovntq_s16(int8x16_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_s32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <4 x i32>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_s32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vmovntq_s32(int16x8_t a, int32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_u16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x i16>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_u16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vmovntq_u16(uint8x16_t a, uint16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_u16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_u32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <4 x i32>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_u32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vmovntq_u32(uint16x8_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
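The shuffle-and-truncate IR in these tests encodes the architectural semantics of the narrowing moves: vmovnb writes the truncated wide lanes into the even ("bottom") narrow lanes and keeps the odd lanes of the destination, while vmovnt does the opposite. A minimal scalar model of one 16-to-8-bit case, in plain C; the helper names and test values are illustrative assumptions, not part of the patch:

```c
#include <stdint.h>
#include <stdio.h>

/* Scalar model of vmovnb/vmovnt lane placement (hypothetical helpers):
 * vmovnb puts trunc(b[i]) in even result bytes, keeping odd bytes of a;
 * vmovnt puts trunc(b[i]) in odd result bytes, keeping even bytes of a. */
static void model_vmovnbq(uint8_t r[16], const uint8_t a[16],
                          const uint16_t b[8]) {
    for (int i = 0; i < 8; i++) {
        r[2 * i] = (uint8_t)b[i];   /* bottom half of each lane pair */
        r[2 * i + 1] = a[2 * i + 1];
    }
}

static void model_vmovntq(uint8_t r[16], const uint8_t a[16],
                          const uint16_t b[8]) {
    for (int i = 0; i < 8; i++) {
        r[2 * i] = a[2 * i];
        r[2 * i + 1] = (uint8_t)b[i]; /* top half of each lane pair */
    }
}

int main(void) {
    uint8_t a[16], r[16];
    uint16_t b[8];
    for (int i = 0; i < 16; i++) a[i] = (uint8_t)(0xA0 + i);
    for (int i = 0; i < 8; i++) b[i] = (uint16_t)(0x0100 + i);
    model_vmovnbq(r, a, b);
    for (int i = 0; i < 16; i++) printf("%02x ", r[i]);
    printf("\n"); /* prints: 00 a1 01 a3 02 a5 ... 07 af */
    return 0;
}
```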
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -1188,6 +1188,18 @@
     } else {
       PrintFatalError("unsignedflag's argument should be a scalar type");
     }
+  } else if (Op->getName() == "bitsize") {
+    if (D->getNumArgs() != 1)
+      PrintFatalError("bitsize should have exactly one argument");
+    Record *TypeRec = cast<DefInit>(D->getArg(0))->getDef();
+    if (!TypeRec->isSubClassOf("Type"))
+      PrintFatalError("bitsize's argument should be a type");
+    if (const auto *ST = dyn_cast<ScalarType>(getType(TypeRec, Param))) {
+      return std::make_shared<IntLiteralResult>(getScalarType("u32"),
+                                                ST->sizeInBits());
+    } else {
+      PrintFatalError("bitsize's argument should be a scalar type");
+    }
   } else {
     std::vector<Result::Ptr> Args;
     for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4322,8 +4322,16 @@
             (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
   def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))),
             (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+
+  def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qm),
+                             (v8i16 (ARMvrev32 MQPR:$Qd_src)), (i32 1))),
+            (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+  def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qm),
+                             (v16i8 (ARMvrev16 MQPR:$Qd_src)), (i32 1))),
+            (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
 }
+
 class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
                   list<dag> pattern=[]>
   : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
                    vpred_r, "$Qd = $Qd_src", pattern> {
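One point worth noting before the llc test: on little-endian targets the vreinterpret in the generated IR lowers to a plain bitcast, but on big-endian it stays an @llvm.arm.mve.vreinterpretq intrinsic, because reinterpreting a vector at a different lane width is sensitive to lane ordering. A rough host-side C analogy of that byte-order sensitivity follows; it models memory layout only, not MVE's register-based lane numbering, and is not part of the patch:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Viewing eight 16-bit lanes as sixteen 8-bit lanes yields a different
 * lane order on little- vs big-endian layouts, which is why a plain
 * bitcast is only valid on one of the two targets. */
int main(void) {
    uint16_t wide[8] = {0x0100, 0x0302, 0x0504, 0x0706,
                        0x0908, 0x0B0A, 0x0D0C, 0x0F0E};
    uint8_t narrow[16];
    memcpy(narrow, wide, sizeof(narrow)); /* the "reinterpret" */
    for (int i = 0; i < 16; i++)
        printf("%02x ", narrow[i]); /* 00 01 02 ... on LE;
                                       swapped within each pair on BE */
    printf("\n");
    return 0;
}
```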
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=LE
+; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=BE
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_s16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovnbq_s16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_s16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnb.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)
+  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %3 = trunc <16 x i16> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_s32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovnbq_s32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_s32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnb.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)
+  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %3 = trunc <8 x i32> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_u16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovnbq_u16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_u16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnb.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)
+  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %3 = trunc <16 x i16> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_u32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovnbq_u32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_u32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnb.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)
+  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %3 = trunc <8 x i32> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_s16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovntq_s16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_s16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnt.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
+  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_s32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovntq_s32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_s32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnt.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
+  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_u16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovntq_u16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_u16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnt.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
+  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_u32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovntq_u32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_u32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnt.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
+  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8>)
+declare <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16>)