Index: clang/include/clang/Basic/arm_cde.td =================================================================== --- clang/include/clang/Basic/arm_cde.td +++ clang/include/clang/Basic/arm_cde.td @@ -37,6 +37,13 @@ class CDEIRInt params = [], bit appendKind = 0> : IRIntBase<"arm_cde_" # name, params, appendKind>; +// Class for generating function macros in arm_cde.h: +// "#define () " +class FunctionMacro params_, string definition_> { + list params = params_; + string definition = definition_; +} + // Coprocessor immediate def imm_coproc : Immediate>; @@ -107,3 +114,77 @@ defm vcx3: CDE_VCXFP_m<(args imm_3b:$imm), (args u32:$n, u32:$m), (args u64:$n, u64:$m), (? (bitcast $n, FScalar), (bitcast $m, FScalar))>; + +// VCX* instructions operating on Q vector registers + +def v16u8 : VecOf; + +let pnt = PNT_None, params = [u8] in +def vcx1q : CDEIntrinsic $cp, $imm)>; + +let pnt = PNT_Type, params = T.All, polymorphicOnly = 1 in { + def vcx1qa : + CDEIntrinsic $cp, (bitcast $acc, v16u8), $imm), + Vector)>; + + def vcx2q : + CDEIntrinsic $cp, (bitcast $n, VecOf), $imm), + Vector)>; + def vcx2q_u8 : + CDEIntrinsic $cp, (bitcast $n, VecOf), $imm)>; + + def vcx2qa_impl : + CDEIntrinsic $cp, (bitcast $acc, v16u8), $n, $imm), + Vector)>; + + def vcx3q_impl : + CDEIntrinsic $cp, (bitcast $n, v16u8), $m, $imm), + Vector)>; + def vcx3q_u8_impl : + CDEIntrinsic $cp, (bitcast $n, v16u8), $m, $imm)>; + def vcx3qa_impl : + CDEIntrinsic $cp, (bitcast $acc, v16u8), $n, $m, + $imm), + Vector)>; +} + +// Reinterpret intrinsics required to implement __arm_vcx*q with 2 or 3 +// polymorphic parameters. 
+let params = [/* no u8 */ s8, u16, s16, u32, s32, u64, s64, f16, f32], + headerOnly = 1, polymorphicOnly = 1 in +def vreinterpretq_u8 : + Intrinsic; + +// We need vreinterpretq_u8_u8 to avoid doing smart tricks in the macros +let params = [u8], polymorphicOnly = 1 in +def vreinterpretq_u8_cde : + CDEIntrinsic, + NameOverride<"vreinterpretq_u8">; + + +def vcx2qa : FunctionMacro< + ["cp", "acc", "n", "imm"], + "__arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))">; + +def vcx3q : FunctionMacro< + ["cp", "n", "m", "imm"], + "__arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">; +def vcx3q_u8 : FunctionMacro< + ["cp", "n", "m", "imm"], + "__arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">; +def vcx3qa : FunctionMacro< + ["cp", "acc", "n", "m", "imm"], + "__arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), " + "__arm_vreinterpretq_u8(m), (imm))">; Index: clang/test/CodeGen/arm-cde-vec.c =================================================================== --- /dev/null +++ clang/test/CodeGen/arm-cde-vec.c @@ -0,0 +1,104 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi \ +// RUN: -target-feature +cdecp0 -target-feature +cdecp1 \ +// RUN: -target-feature +mve.fp \ +// RUN: -mfloat-abi hard -O0 -disable-O0-optnone \ +// RUN: -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vcx1q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vcx1q_u8(void) { + return __arm_vcx1q_u8(0, 1111); +} + +// CHECK-LABEL: @test_vcx1qa_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> [[ACC:%.*]], i32 1112) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vcx1qa_1(uint8x16_t acc) { + return __arm_vcx1qa(1, acc, 1112); +} + +// 
CHECK-LABEL: @test_vcx1qa_2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[ACC:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> [[TMP0]], i32 1113) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vcx1qa_2(int32x4_t acc) { + return __arm_vcx1qa(0, acc, 1113); +} + +// CHECK-LABEL: @test_vcx2q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 111) +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +uint8x16_t test_vcx2q_u8(float16x8_t n) { + return __arm_vcx2q_u8(1, n, 111); +} + +// CHECK-LABEL: @test_vcx2q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 112) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vcx2q(float32x4_t n) { + return __arm_vcx2q(1, n, 112); +} + +// CHECK-LABEL: @test_vcx2qa( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACC:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 113) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_vcx2qa(float32x4_t acc, int64x2_t n) { + return __arm_vcx2qa(0, acc, n, 113); +} + +// CHECK-LABEL: @test_vcx3q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[M:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x 
i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 11) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vcx3q_u8(uint16x8_t n, int32x4_t m) { + return __arm_vcx3q_u8(0, n, m, 11); +} + +// CHECK-LABEL: @test_vcx3q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 12) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP3]] +// +uint64x2_t test_vcx3q(uint64x2_t n, float32x4_t m) { + return __arm_vcx3q(1, n, m, 12); +} + +// CHECK-LABEL: @test_vcx3qa( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> [[ACC:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 13) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vcx3qa(int8x16_t acc, uint16x8_t n, float32x4_t m) { + return __arm_vcx3qa(1, acc, n, m, 13); +} Index: clang/test/Sema/arm-cde-immediates.c =================================================================== --- clang/test/Sema/arm-cde-immediates.c +++ clang/test/Sema/arm-cde-immediates.c @@ -103,3 +103,27 @@ __arm_vcx3da_u64(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3da_u64' must be a constant integer}} __arm_vcx3da_u64(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} } + +void test_vcxq(uint32_t a, uint8x16_t acc, float16x8_t n, int64x2_t m) { + (void)__arm_vcx1q_u8(0, 0); + __arm_vcx1q_u8(0, a); // expected-error {{argument to '__arm_vcx1q_u8' must be a constant integer}} + __arm_vcx1q_u8(0, 4096); // expected-error {{argument value 4096 is outside the valid range [0, 4095]}} 
+ __arm_vcx1qa(0, acc, a); // expected-error {{argument to '__arm_vcx1qa' must be a constant integer}} + __arm_vcx1qa(0, acc, 4096); // expected-error {{argument value 4096 is outside the valid range [0, 4095]}} + + (void)__arm_vcx2q_u8(0, n, 0); + __arm_vcx2q_u8(0, n, a); // expected-error {{argument to '__arm_vcx2q_u8' must be a constant integer}} + __arm_vcx2q_u8(0, n, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}} + __arm_vcx2q(0, n, a); // expected-error {{argument to '__arm_vcx2q' must be a constant integer}} + __arm_vcx2q(0, n, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}} + __arm_vcx2qa(0, n, acc, a); // expected-error {{argument to '__arm_vcx2qa_impl' must be a constant integer}} + __arm_vcx2qa(0, n, acc, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}} + + (void)__arm_vcx3q_u8(0, n, m, 0); + __arm_vcx3q_u8(0, n, m, a); // expected-error {{argument to '__arm_vcx3q_u8_impl' must be a constant integer}} + __arm_vcx3q_u8(0, n, m, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + __arm_vcx3q(0, n, m, a); // expected-error {{argument to '__arm_vcx3q_impl' must be a constant integer}} + __arm_vcx3q(0, n, m, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + __arm_vcx3qa(0, n, m, acc, a); // expected-error {{argument to '__arm_vcx3qa_impl' must be a constant integer}} + __arm_vcx3qa(0, n, m, acc, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} +} Index: clang/utils/TableGen/MveEmitter.cpp =================================================================== --- clang/utils/TableGen/MveEmitter.cpp +++ clang/utils/TableGen/MveEmitter.cpp @@ -1962,18 +1962,48 @@ } } +// ----------------------------------------------------------------------------- +// Class that describes an ACLE intrinsic implemented as a macro. 
+// +// This class is used when the intrinsic is polymorphic in 2 or 3 types, but we +// want to avoid a combinatorial explosion by reinterpreting the arguments to +// fixed types. + +class FunctionMacro { + std::vector Params; + StringRef Definition; + +public: + FunctionMacro(const Record &R); + + const std::vector &getParams() const { return Params; } + StringRef getDefinition() const { return Definition; } +}; + +FunctionMacro::FunctionMacro(const Record &R) { + Params = R.getValueAsListOfStrings("params"); + Definition = R.getValueAsString("definition"); +} + // ----------------------------------------------------------------------------- // The class used for generating arm_cde.h and related Clang bits // class CdeEmitter : public EmitterBase { + std::map FunctionMacros; + public: - CdeEmitter(RecordKeeper &Records) : EmitterBase(Records){}; + CdeEmitter(RecordKeeper &Records); void EmitHeader(raw_ostream &OS) override; void EmitBuiltinDef(raw_ostream &OS) override; void EmitBuiltinSema(raw_ostream &OS) override; }; +CdeEmitter::CdeEmitter(RecordKeeper &Records) : EmitterBase(Records) { + for (Record *R : Records.getAllDerivedDefinitions("FunctionMacro")) + FunctionMacros.emplace(R->getName(), FunctionMacro(*R)); +} + void CdeEmitter::EmitHeader(raw_ostream &OS) { // Accumulate pieces of the header file that will be enabled under various // different combinations of #ifdef. 
The index into parts[] is one of the @@ -2051,6 +2081,16 @@ } } + for (const auto &kv : FunctionMacros) { + StringRef Name = kv.first; + const FunctionMacro &FM = kv.second; + + raw_ostream &OS = parts[MVE]; + OS << "#define " + << "__arm_" << Name << "(" << join(FM.getParams(), ", ") << ") " + << FM.getDefinition() << "\n"; + } + for (auto &part : parts) part << "\n"; Index: llvm/include/llvm/IR/IntrinsicsARM.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsARM.td +++ llvm/include/llvm/IR/IntrinsicsARM.td @@ -1291,4 +1291,20 @@ defm int_arm_cde_vcx2 : CDEVCXIntrinsics<[LLVMMatchType<0>]>; defm int_arm_cde_vcx3 : CDEVCXIntrinsics<[LLVMMatchType<0>, LLVMMatchType<0>]>; +multiclass CDEVCXVecIntrinsics args> { + def "" : Intrinsic< + [llvm_v16i8_ty], + !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; + def a : Intrinsic< + [llvm_v16i8_ty], + !listconcat([llvm_i32_ty /* coproc */, llvm_v16i8_ty /* acc */], + args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; +} + +defm int_arm_cde_vcx1q : CDEVCXVecIntrinsics<[]>; +defm int_arm_cde_vcx2q : CDEVCXVecIntrinsics<[llvm_v16i8_ty]>; +defm int_arm_cde_vcx3q : CDEVCXVecIntrinsics<[llvm_v16i8_ty, llvm_v16i8_ty]>; + } // end TargetPrefix Index: llvm/lib/Target/ARM/ARMInstrCDE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrCDE.td +++ llvm/lib/Target/ARM/ARMInstrCDE.td @@ -581,3 +581,28 @@ (f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m, imm_3b:$imm))>; } + +let Predicates = [HasCDE, HasMVEInt] in { + def : Pat<(v16i8 (int_arm_cde_vcx1q timm:$coproc, timm:$imm)), + (v16i8 (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm))>; + def : Pat<(v16i8 (int_arm_cde_vcx1qa timm:$coproc, (v16i8 MQPR:$acc), + timm:$imm)), + (v16i8 (CDE_VCX1A_vec p_imm:$coproc, MQPR:$acc, imm_12b:$imm))>; + + def : Pat<(v16i8 (int_arm_cde_vcx2q timm:$coproc, (v16i8 
MQPR:$n), timm:$imm)), + (v16i8 (CDE_VCX2_vec p_imm:$coproc, MQPR:$n, imm_7b:$imm))>; + def : Pat<(v16i8 (int_arm_cde_vcx2qa timm:$coproc, (v16i8 MQPR:$acc), + (v16i8 MQPR:$n), timm:$imm)), + (v16i8 (CDE_VCX2A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, + imm_7b:$imm))>; + + def : Pat<(v16i8 (int_arm_cde_vcx3q timm:$coproc, (v16i8 MQPR:$n), + (v16i8 MQPR:$m), timm:$imm)), + (v16i8 (CDE_VCX3_vec p_imm:$coproc, MQPR:$n, MQPR:$m, + imm_4b:$imm))>; + def : Pat<(v16i8 (int_arm_cde_vcx3qa timm:$coproc, (v16i8 MQPR:$acc), + (v16i8 MQPR:$n), (v16i8 MQPR:$m), + timm:$imm)), + (v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m, + imm_4b:$imm))>; +} Index: llvm/test/CodeGen/Thumb2/cde-vec.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/cde-vec.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s + +declare <16 x i8> @llvm.arm.cde.vcx1q(i32 immarg, i32 immarg) +declare <16 x i8> @llvm.arm.cde.vcx1qa(i32 immarg, <16 x i8>, i32 immarg) +declare <16 x i8> @llvm.arm.cde.vcx2q(i32 immarg, <16 x i8>, i32 immarg) +declare <16 x i8> @llvm.arm.cde.vcx2qa(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg) +declare <16 x i8> @llvm.arm.cde.vcx3q(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg) +declare <16 x i8> @llvm.arm.cde.vcx3qa(i32 immarg, <16 x i8>, <16 x i8>, <16 x i8>, i32 immarg) + +define arm_aapcs_vfpcc <16 x i8> @test_vcx1q_u8() { +; CHECK-LABEL: test_vcx1q_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx1 p0, q0, #1111 +; CHECK-NEXT: bx lr +entry: + %0 = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vcx1qa_1(<16 x i8> %acc) { +; CHECK-LABEL: test_vcx1qa_1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx1a p1, q0, #1112 +; CHECK-NEXT: bx lr +entry: + %0 = call <16 x i8> 
@llvm.arm.cde.vcx1qa(i32 1, <16 x i8> %acc, i32 1112) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vcx1qa_2(<4 x i32> %acc) { +; CHECK-LABEL: test_vcx1qa_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx1a p0, q0, #1113 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x i32> %acc to <16 x i8> + %1 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> %0, i32 1113) + %2 = bitcast <16 x i8> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vcx2q_u8(<8 x half> %n) { +; CHECK-LABEL: test_vcx2q_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx2 p1, q0, q0, #111 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <8 x half> %n to <16 x i8> + %1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 111) + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <4 x float> @test_vcx2q(<4 x float> %n) { +; CHECK-LABEL: test_vcx2q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx2 p1, q0, q0, #112 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %n to <16 x i8> + %1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 112) + %2 = bitcast <16 x i8> %1 to <4 x float> + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vcx2qa(<4 x float> %acc, <2 x i64> %n) { +; CHECK-LABEL: test_vcx2qa: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx2a p0, q0, q1, #113 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %acc to <16 x i8> + %1 = bitcast <2 x i64> %n to <16 x i8> + %2 = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> %0, <16 x i8> %1, i32 113) + %3 = bitcast <16 x i8> %2 to <4 x float> + ret <4 x float> %3 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vcx3q_u8(<8 x i16> %n, <4 x i32> %m) { +; CHECK-LABEL: test_vcx3q_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx3 p0, q0, q0, q1, #11 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <8 x i16> %n to <16 x i8> + %1 = bitcast <4 x i32> %m to <16 x i8> + %2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> %0, <16 x i8> %1, i32 11) + ret <16 x i8> %2 +} + 
+define arm_aapcs_vfpcc <2 x i64> @test_vcx3q(<2 x i64> %n, <4 x float> %m) { +; CHECK-LABEL: test_vcx3q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx3 p1, q0, q0, q1, #12 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <2 x i64> %n to <16 x i8> + %1 = bitcast <4 x float> %m to <16 x i8> + %2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> %0, <16 x i8> %1, i32 12) + %3 = bitcast <16 x i8> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vcx3qa(<16 x i8> %acc, <8 x i16> %n, <4 x float> %m) { +; CHECK-LABEL: test_vcx3qa: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx3a p1, q0, q1, q2, #13 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <8 x i16> %n to <16 x i8> + %1 = bitcast <4 x float> %m to <16 x i8> + %2 = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> %acc, <16 x i8> %0, <16 x i8> %1, i32 13) + ret <16 x i8> %2 +}