diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index 885a2314d1ab..db5111e69105 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -1,839 +1,846 @@ //===- IntrinsicsARM.td - Defines ARM intrinsics -----------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines all of the ARM-specific intrinsics. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // TLS let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.". // A space-consuming intrinsic primarily for testing ARMConstantIslands. The // first argument is the number of bytes this "instruction" takes up, the second // and return value are essentially chains, used to force ordering during ISel. def int_arm_space : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [ImmArg<0>]>; // 16-bit multiplications def int_arm_smulbb : GCCBuiltin<"__builtin_arm_smulbb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smulbt : GCCBuiltin<"__builtin_arm_smulbt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smultb : GCCBuiltin<"__builtin_arm_smultb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smultt : GCCBuiltin<"__builtin_arm_smultt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smulwb : GCCBuiltin<"__builtin_arm_smulwb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smulwt : GCCBuiltin<"__builtin_arm_smulwt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // Saturating Arithmetic def int_arm_qadd : GCCBuiltin<"__builtin_arm_qadd">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [Commutative, IntrNoMem]>; def int_arm_qsub : GCCBuiltin<"__builtin_arm_qsub">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_ssat : GCCBuiltin<"__builtin_arm_ssat">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_usat : GCCBuiltin<"__builtin_arm_usat">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Accumulating multiplications def int_arm_smlabb : GCCBuiltin<"__builtin_arm_smlabb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlabt : GCCBuiltin<"__builtin_arm_smlabt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlatb : GCCBuiltin<"__builtin_arm_smlatb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlatt : GCCBuiltin<"__builtin_arm_smlatt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlawb : GCCBuiltin<"__builtin_arm_smlawb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlawt : GCCBuiltin<"__builtin_arm_smlawt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Parallel 16-bit saturation def int_arm_ssat16 : GCCBuiltin<"__builtin_arm_ssat16">, 
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_usat16 : GCCBuiltin<"__builtin_arm_usat16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Packing and unpacking def int_arm_sxtab16 : GCCBuiltin<"__builtin_arm_sxtab16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_sxtb16 : GCCBuiltin<"__builtin_arm_sxtb16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_uxtab16 : GCCBuiltin<"__builtin_arm_uxtab16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uxtb16 : GCCBuiltin<"__builtin_arm_uxtb16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; // Parallel selection, reads the GE flags. def int_arm_sel : GCCBuiltin<"__builtin_arm_sel">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>; // Parallel 8-bit addition and subtraction def int_arm_qadd8 : GCCBuiltin<"__builtin_arm_qadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_qsub8 : GCCBuiltin<"__builtin_arm_qsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_sadd8 : GCCBuiltin<"__builtin_arm_sadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; def int_arm_shadd8 : GCCBuiltin<"__builtin_arm_shadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_shsub8 : GCCBuiltin<"__builtin_arm_shsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_ssub8 : GCCBuiltin<"__builtin_arm_ssub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. def int_arm_uadd8 : GCCBuiltin<"__builtin_arm_uadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; def int_arm_uhadd8 : GCCBuiltin<"__builtin_arm_uhadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uhsub8 : GCCBuiltin<"__builtin_arm_uhsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqadd8 : GCCBuiltin<"__builtin_arm_uqadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqsub8 : GCCBuiltin<"__builtin_arm_uqsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_usub8 : GCCBuiltin<"__builtin_arm_usub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Sum of 8-bit absolute differences def int_arm_usad8 : GCCBuiltin<"__builtin_arm_usad8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_usada8 : GCCBuiltin<"__builtin_arm_usada8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Parallel 16-bit addition and subtraction def int_arm_qadd16 : GCCBuiltin<"__builtin_arm_qadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_qasx : GCCBuiltin<"__builtin_arm_qasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_qsax : GCCBuiltin<"__builtin_arm_qsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_qsub16 : GCCBuiltin<"__builtin_arm_qsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_sadd16 : GCCBuiltin<"__builtin_arm_sadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. 
def int_arm_sasx : GCCBuiltin<"__builtin_arm_sasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; def int_arm_shadd16 : GCCBuiltin<"__builtin_arm_shadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_shasx : GCCBuiltin<"__builtin_arm_shasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_shsax : GCCBuiltin<"__builtin_arm_shsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_shsub16 : GCCBuiltin<"__builtin_arm_shsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_ssax : GCCBuiltin<"__builtin_arm_ssax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. def int_arm_ssub16 : GCCBuiltin<"__builtin_arm_ssub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. def int_arm_uadd16 : GCCBuiltin<"__builtin_arm_uadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. def int_arm_uasx : GCCBuiltin<"__builtin_arm_uasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; def int_arm_uhadd16 : GCCBuiltin<"__builtin_arm_uhadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uhasx : GCCBuiltin<"__builtin_arm_uhasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uhsax : GCCBuiltin<"__builtin_arm_uhsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uhsub16 : GCCBuiltin<"__builtin_arm_uhsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqadd16 : GCCBuiltin<"__builtin_arm_uqadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqasx : GCCBuiltin<"__builtin_arm_uqasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqsax : GCCBuiltin<"__builtin_arm_uqsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqsub16 : GCCBuiltin<"__builtin_arm_uqsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_usax : GCCBuiltin<"__builtin_arm_usax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. 
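// Illustrative sketch (not from this patch): the GE-setting parallel add/subtract
// intrinsics above pair with int_arm_sel (SEL), which reads the GE flags they set.
// The C-level idiom below assumes a Clang ARM target where the __builtin_arm_*
// names declared by the GCCBuiltin<> annotations above are available; the helper
// name bytewise_max is invented for this example.

#include <cstdint>

// Per-byte unsigned maximum: USUB8 sets GE[i] when byte i of `a` is >= byte i
// of `b` (no borrow), and SEL then picks byte i from `a` where GE[i] is set,
// otherwise from `b`.
static inline uint32_t bytewise_max(uint32_t a, uint32_t b) {
  (void)__builtin_arm_usub8(a, b); // difference unused; only the GE flags matter
  return __builtin_arm_sel(a, b);
}
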
def int_arm_usub16 : GCCBuiltin<"__builtin_arm_usub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Parallel 16-bit multiplication def int_arm_smlad : GCCBuiltin<"__builtin_arm_smlad">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smladx : GCCBuiltin<"__builtin_arm_smladx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlald : GCCBuiltin<"__builtin_arm_smlald">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; def int_arm_smlaldx : GCCBuiltin<"__builtin_arm_smlaldx">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; def int_arm_smlsd : GCCBuiltin<"__builtin_arm_smlsd">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlsdx : GCCBuiltin<"__builtin_arm_smlsdx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlsld : GCCBuiltin<"__builtin_arm_smlsld">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; def int_arm_smlsldx : GCCBuiltin<"__builtin_arm_smlsldx">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; def int_arm_smuad : GCCBuiltin<"__builtin_arm_smuad">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smuadx : GCCBuiltin<"__builtin_arm_smuadx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smusd : GCCBuiltin<"__builtin_arm_smusd">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smusdx : GCCBuiltin<"__builtin_arm_smusdx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // Load, Store and Clear exclusive def int_arm_ldrex : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty]>; def int_arm_strex : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_anyptr_ty]>; def int_arm_ldaex : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty]>; def int_arm_stlex : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_anyptr_ty]>; def int_arm_clrex : Intrinsic<[]>; def int_arm_strexd : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty]>; def int_arm_ldrexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty]>; def int_arm_stlexd : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty]>; def int_arm_ldaexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty]>; //===----------------------------------------------------------------------===// // Data barrier instructions def int_arm_dmb : GCCBuiltin<"__builtin_arm_dmb">, MSBuiltin<"__dmb">, Intrinsic<[], [llvm_i32_ty]>; def int_arm_dsb : GCCBuiltin<"__builtin_arm_dsb">, MSBuiltin<"__dsb">, Intrinsic<[], [llvm_i32_ty]>; def int_arm_isb : GCCBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">, Intrinsic<[], [llvm_i32_ty]>; //===----------------------------------------------------------------------===// // VFP def int_arm_get_fpscr : GCCBuiltin<"__builtin_arm_get_fpscr">, Intrinsic<[llvm_i32_ty], [], []>; def int_arm_set_fpscr : GCCBuiltin<"__builtin_arm_set_fpscr">, Intrinsic<[], [llvm_i32_ty], []>; def int_arm_vcvtr : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty], [IntrNoMem]>; def int_arm_vcvtru : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // Coprocessor def int_arm_ldc : GCCBuiltin<"__builtin_arm_ldc">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, 
llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_ldcl : GCCBuiltin<"__builtin_arm_ldcl">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_ldc2 : GCCBuiltin<"__builtin_arm_ldc2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_ldc2l : GCCBuiltin<"__builtin_arm_ldc2l">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_stc : GCCBuiltin<"__builtin_arm_stc">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_stcl : GCCBuiltin<"__builtin_arm_stcl">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_stc2 : GCCBuiltin<"__builtin_arm_stc2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_stc2l : GCCBuiltin<"__builtin_arm_stc2l">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; // Move to coprocessor def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>; def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>; // Move from coprocessor def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">, MSBuiltin<"_MoveFromCoprocessor">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>]>; def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">, MSBuiltin<"_MoveFromCoprocessor2">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>]>; // Coprocessor data processing def int_arm_cdp : GCCBuiltin<"__builtin_arm_cdp">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>; def int_arm_cdp2 : GCCBuiltin<"__builtin_arm_cdp2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>; // Move from two registers to coprocessor def int_arm_mcrr : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<4>]>; def int_arm_mcrr2 : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<4>]>; def int_arm_mrrc : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>]>; def int_arm_mrrc2 : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>]>; //===----------------------------------------------------------------------===// // CRC32 def int_arm_crc32b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_crc32h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_crc32w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 
[IntrNoMem]>; //===----------------------------------------------------------------------===// // CMSE def int_arm_cmse_tt : GCCBuiltin<"__builtin_arm_cmse_TT">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; def int_arm_cmse_ttt : GCCBuiltin<"__builtin_arm_cmse_TTT">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; def int_arm_cmse_tta : GCCBuiltin<"__builtin_arm_cmse_TTA">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; def int_arm_cmse_ttat : GCCBuiltin<"__builtin_arm_cmse_TTAT">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // HINT def int_arm_hint : Intrinsic<[], [llvm_i32_ty]>; def int_arm_dbg : Intrinsic<[], [llvm_i32_ty]>; //===----------------------------------------------------------------------===// // UND (reserved undefined sequence) def int_arm_undefined : Intrinsic<[], [llvm_i32_ty]>; //===----------------------------------------------------------------------===// // Advanced SIMD (NEON) // The following classes do not correspond directly to GCC builtins. class Neon_1Arg_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; class Neon_1Arg_Narrow_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMExtendedType<0>], [IntrNoMem]>; class Neon_2Arg_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; class Neon_2Arg_Narrow_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMExtendedType<0>, LLVMExtendedType<0>], [IntrNoMem]>; class Neon_2Arg_Long_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>, LLVMTruncatedType<0>], [IntrNoMem]>; class Neon_3Arg_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; class Neon_3Arg_Long_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMTruncatedType<0>, LLVMTruncatedType<0>], [IntrNoMem]>; class Neon_1FloatArg_Intrinsic : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; class Neon_CvtFxToFP_Intrinsic : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; class Neon_CvtFPToFx_Intrinsic : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; class Neon_CvtFPtoInt_1Arg_Intrinsic : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; class Neon_Compare_Intrinsic : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; // The table operands for VTBL and VTBX consist of 1 to 4 v8i8 vectors. // Besides the table, VTBL has one other v8i8 argument and VTBX has two. // Overall, the classes range from 2 to 6 v8i8 arguments. class Neon_Tbl2Arg_Intrinsic : Intrinsic<[llvm_v8i8_ty], [llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>; class Neon_Tbl3Arg_Intrinsic : Intrinsic<[llvm_v8i8_ty], [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>; class Neon_Tbl4Arg_Intrinsic : Intrinsic<[llvm_v8i8_ty], [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>; class Neon_Tbl5Arg_Intrinsic : Intrinsic<[llvm_v8i8_ty], [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>; class Neon_Tbl6Arg_Intrinsic : Intrinsic<[llvm_v8i8_ty], [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>; // Arithmetic ops let IntrProperties = [IntrNoMem, Commutative] in { // Vector Add. 
def int_arm_neon_vhadds : Neon_2Arg_Intrinsic; def int_arm_neon_vhaddu : Neon_2Arg_Intrinsic; def int_arm_neon_vrhadds : Neon_2Arg_Intrinsic; def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic; def int_arm_neon_vqadds : Neon_2Arg_Intrinsic; def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic; def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic; // Vector Multiply. def int_arm_neon_vmulp : Neon_2Arg_Intrinsic; def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic; def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic; def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic; def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic; def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic; def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic; // Vector Maximum. def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic; def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic; def int_arm_neon_vmaxnm : Neon_2Arg_Intrinsic; // Vector Minimum. def int_arm_neon_vmins : Neon_2Arg_Intrinsic; def int_arm_neon_vminu : Neon_2Arg_Intrinsic; def int_arm_neon_vminnm : Neon_2Arg_Intrinsic; // Vector Reciprocal Step. def int_arm_neon_vrecps : Neon_2Arg_Intrinsic; // Vector Reciprocal Square Root Step. def int_arm_neon_vrsqrts : Neon_2Arg_Intrinsic; } // Vector Subtract. def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic; def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic; def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic; def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic; def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic; // Vector Absolute Compare. def int_arm_neon_vacge : Neon_Compare_Intrinsic; def int_arm_neon_vacgt : Neon_Compare_Intrinsic; // Vector Absolute Differences. def int_arm_neon_vabds : Neon_2Arg_Intrinsic; def int_arm_neon_vabdu : Neon_2Arg_Intrinsic; // Vector Pairwise Add. def int_arm_neon_vpadd : Neon_2Arg_Intrinsic; // Vector Pairwise Add Long. // Note: This is different than the other "long" NEON intrinsics because // the result vector has half as many elements as the source vector. // The source and destination vector types must be specified separately. def int_arm_neon_vpaddls : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; // Vector Pairwise Add and Accumulate Long. // Note: This is similar to vpaddl but the destination vector also appears // as the first argument. def int_arm_neon_vpadals : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty], [IntrNoMem]>; def int_arm_neon_vpadalu : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty], [IntrNoMem]>; // Vector Pairwise Maximum and Minimum. def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic; def int_arm_neon_vpmaxu : Neon_2Arg_Intrinsic; def int_arm_neon_vpmins : Neon_2Arg_Intrinsic; def int_arm_neon_vpminu : Neon_2Arg_Intrinsic; // Vector Shifts: // // The various saturating and rounding vector shift operations need to be // represented by intrinsics in LLVM, and even the basic VSHL variable shift // operation cannot be safely translated to LLVM's shift operators. VSHL can // be used for both left and right shifts, or even combinations of the two, // depending on the signs of the shift amounts. It also has well-defined // behavior for shift amounts that LLVM leaves undefined. Only basic shifts // by constants can be represented with LLVM's shift operators. // // The shift counts for these intrinsics are always vectors, even for constant // shifts, where the constant is replicated. 
For consistency with VSHL (and // other variable shift instructions), left shifts have positive shift counts // and right shifts have negative shift counts. This convention is also used // for constant right shift intrinsics, and to help preserve sanity, the // intrinsic names use "shift" instead of either "shl" or "shr". Where // applicable, signed and unsigned versions of the intrinsics are // distinguished with "s" and "u" suffixes. A few NEON shift instructions, // such as VQSHLU, take signed operands but produce unsigned results; these // use a "su" suffix. // Vector Shift. def int_arm_neon_vshifts : Neon_2Arg_Intrinsic; def int_arm_neon_vshiftu : Neon_2Arg_Intrinsic; // Vector Rounding Shift. def int_arm_neon_vrshifts : Neon_2Arg_Intrinsic; def int_arm_neon_vrshiftu : Neon_2Arg_Intrinsic; def int_arm_neon_vrshiftn : Neon_2Arg_Narrow_Intrinsic; // Vector Saturating Shift. def int_arm_neon_vqshifts : Neon_2Arg_Intrinsic; def int_arm_neon_vqshiftu : Neon_2Arg_Intrinsic; def int_arm_neon_vqshiftsu : Neon_2Arg_Intrinsic; def int_arm_neon_vqshiftns : Neon_2Arg_Narrow_Intrinsic; def int_arm_neon_vqshiftnu : Neon_2Arg_Narrow_Intrinsic; def int_arm_neon_vqshiftnsu : Neon_2Arg_Narrow_Intrinsic; // Vector Saturating Rounding Shift. def int_arm_neon_vqrshifts : Neon_2Arg_Intrinsic; def int_arm_neon_vqrshiftu : Neon_2Arg_Intrinsic; def int_arm_neon_vqrshiftns : Neon_2Arg_Narrow_Intrinsic; def int_arm_neon_vqrshiftnu : Neon_2Arg_Narrow_Intrinsic; def int_arm_neon_vqrshiftnsu : Neon_2Arg_Narrow_Intrinsic; // Vector Shift and Insert. def int_arm_neon_vshiftins : Neon_3Arg_Intrinsic; // Vector Absolute Value and Saturating Absolute Value. def int_arm_neon_vabs : Neon_1Arg_Intrinsic; def int_arm_neon_vqabs : Neon_1Arg_Intrinsic; // Vector Saturating Negate. def int_arm_neon_vqneg : Neon_1Arg_Intrinsic; // Vector Count Leading Sign/Zero Bits. def int_arm_neon_vcls : Neon_1Arg_Intrinsic; // Vector Reciprocal Estimate. def int_arm_neon_vrecpe : Neon_1Arg_Intrinsic; // Vector Reciprocal Square Root Estimate. def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic; // Vector Conversions Between Floating-point and Integer def int_arm_neon_vcvtau : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtas : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtnu : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtns : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtpu : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtps : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtmu : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtms : Neon_CvtFPtoInt_1Arg_Intrinsic; // Vector Conversions Between Floating-point and Fixed-point. def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic; def int_arm_neon_vcvtfp2fxu : Neon_CvtFPToFx_Intrinsic; def int_arm_neon_vcvtfxs2fp : Neon_CvtFxToFP_Intrinsic; def int_arm_neon_vcvtfxu2fp : Neon_CvtFxToFP_Intrinsic; // Vector Conversions Between Half-Precision and Single-Precision. def int_arm_neon_vcvtfp2hf : Intrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_arm_neon_vcvthf2fp : Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>; // Narrowing Saturating Vector Moves. def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic; def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic; def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic; // Vector Table Lookup. // The first 1-4 arguments are the table. 
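// Illustrative sketch (not from this patch): a scalar model of what one-register
// VTBL (int_arm_neon_vtbl1 below) computes under the usual ISA semantics -- each
// index byte selects a byte of the table, and out-of-range indices produce zero
// (the VTBX variants instead leave the destination byte unchanged). The helper
// name vtbl1_model is invented for this illustration.

#include <array>
#include <cstdint>

static std::array<uint8_t, 8> vtbl1_model(const std::array<uint8_t, 8> &table,
                                          const std::array<uint8_t, 8> &idx) {
  std::array<uint8_t, 8> out{};
  for (int i = 0; i < 8; ++i)
    out[i] = (idx[i] < 8) ? table[idx[i]] : 0; // index past the table -> 0
  return out;
}
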
def int_arm_neon_vtbl1 : Neon_Tbl2Arg_Intrinsic;
def int_arm_neon_vtbl2 : Neon_Tbl3Arg_Intrinsic;
def int_arm_neon_vtbl3 : Neon_Tbl4Arg_Intrinsic;
def int_arm_neon_vtbl4 : Neon_Tbl5Arg_Intrinsic;

// Vector Table Extension.
// Some elements of the destination vector may not be updated, so the original
// value of that vector is passed as the first argument. The next 1-4
// arguments after that are the table.
def int_arm_neon_vtbx1 : Neon_Tbl3Arg_Intrinsic;
def int_arm_neon_vtbx2 : Neon_Tbl4Arg_Intrinsic;
def int_arm_neon_vtbx3 : Neon_Tbl5Arg_Intrinsic;
def int_arm_neon_vtbx4 : Neon_Tbl6Arg_Intrinsic;

// Vector and Scalar Rounding.
def int_arm_neon_vrintn : Neon_1FloatArg_Intrinsic;
def int_arm_neon_vrintx : Neon_1Arg_Intrinsic;
def int_arm_neon_vrinta : Neon_1Arg_Intrinsic;
def int_arm_neon_vrintz : Neon_1Arg_Intrinsic;
def int_arm_neon_vrintm : Neon_1Arg_Intrinsic;
def int_arm_neon_vrintp : Neon_1Arg_Intrinsic;

// De-interleaving vector loads from N-element structures.
// Source operands are the address and alignment.
def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_arm_neon_vld1x2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, IntrArgMemOnly]>;
def int_arm_neon_vld1x3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, IntrArgMemOnly]>;
def int_arm_neon_vld1x4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, IntrArgMemOnly]>;

// Vector load N-element structure to one lane.
// Source operands are: the address, the N input vectors (since only one
// lane is assigned), the lane number, and the alignment.
def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;

// Vector load N-element structure to all lanes.
// Source operands are the address and alignment.
def int_arm_neon_vld2dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_arm_neon_vld3dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_arm_neon_vld4dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;

// Interleaving vector stores from N-element structures.
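// Illustrative sketch (not from this patch): the de-interleaving loads above and
// the interleaving stores defined below are inverses of each other. A scalar model
// of the 2-element-structure case (vld2 on v8i8), assuming memory laid out as
// {a0,b0,a1,b1,...}; the type and function names here are invented.

#include <array>
#include <cstdint>

struct Vld2Model { std::array<uint8_t, 8> v0, v1; };

static Vld2Model deinterleave2(const uint8_t *p) {
  Vld2Model r;
  for (int i = 0; i < 8; ++i) {
    r.v0[i] = p[2 * i];     // element 0 of each structure -> first result vector
    r.v1[i] = p[2 * i + 1]; // element 1 of each structure -> second result vector
  }
  return r;
}
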
// Source operands are: the address, the N vectors, and the alignment. def int_arm_neon_vst1 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst2 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst3 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst4 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst1x2 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>], [IntrArgMemOnly, NoCapture<0>]>; def int_arm_neon_vst1x3 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>], [IntrArgMemOnly, NoCapture<0>]>; def int_arm_neon_vst1x4 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>], [IntrArgMemOnly, NoCapture<0>]>; // Vector store N-element structure from one lane. // Source operands are: the address, the N vectors, the lane number, and // the alignment. def int_arm_neon_vst2lane : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst3lane : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst4lane : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty, llvm_i32_ty], [IntrArgMemOnly]>; // Vector bitwise select. def int_arm_neon_vbsl : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; // Crypto instructions class AES_1Arg_Intrinsic : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; class AES_2Arg_Intrinsic : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; class SHA_1Arg_Intrinsic : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; class SHA_2Arg_Intrinsic : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; class SHA_3Arg_i32_Intrinsic : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], [IntrNoMem]>; class SHA_3Arg_v4i32_Intrinsic : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,llvm_v4i32_ty], [IntrNoMem]>; def int_arm_neon_aesd : AES_2Arg_Intrinsic; def int_arm_neon_aese : AES_2Arg_Intrinsic; def int_arm_neon_aesimc : AES_1Arg_Intrinsic; def int_arm_neon_aesmc : AES_1Arg_Intrinsic; def int_arm_neon_sha1h : SHA_1Arg_Intrinsic; def int_arm_neon_sha1su1 : SHA_2Arg_Intrinsic; def int_arm_neon_sha256su0 : SHA_2Arg_Intrinsic; def int_arm_neon_sha1c : SHA_3Arg_i32_Intrinsic; def int_arm_neon_sha1m : SHA_3Arg_i32_Intrinsic; def int_arm_neon_sha1p : SHA_3Arg_i32_Intrinsic; def int_arm_neon_sha1su0: SHA_3Arg_v4i32_Intrinsic; def int_arm_neon_sha256h: SHA_3Arg_v4i32_Intrinsic; def int_arm_neon_sha256h2: SHA_3Arg_v4i32_Intrinsic; def int_arm_neon_sha256su1: SHA_3Arg_v4i32_Intrinsic; // Armv8.2-A dot product instructions class Neon_Dot_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; def int_arm_neon_udot : Neon_Dot_Intrinsic; def int_arm_neon_sdot : Neon_Dot_Intrinsic; def int_arm_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_vctp32 : 
Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_arm_vctp64 : Intrinsic<[llvm_v2i1_ty], [llvm_i32_ty], [IntrNoMem]>;

// GNU eabi mcount
def int_arm_gnu_eabi_mcount : Intrinsic<[], [], [IntrReadMem, IntrWriteMem]>;

def int_arm_mve_pred_i2v : Intrinsic<[llvm_anyvector_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_arm_mve_pred_v2i : Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrNoMem]>;

multiclass IntrinsicSignSuffix<list<LLVMType> rets, list<LLVMType> params = [], list<IntrinsicProperty> props = [], string name = "", list<SDNodeProperty> sdprops = []> {
  def _s: Intrinsic<rets, params, props, name, sdprops>;
  def _u: Intrinsic<rets, params, props, name, sdprops>;
}

def int_arm_mve_add_predicated: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
def int_arm_mve_sub_predicated: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;

defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty], [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty], [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;

def int_arm_mve_vcvt_narrow: Intrinsic<[llvm_v8f16_ty], [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_arm_mve_vcvt_narrow_predicated: Intrinsic<[llvm_v8f16_ty], [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4i1_ty], [IntrNoMem]>;

def int_arm_mve_vldr_gather_base_wb: Intrinsic<[llvm_anyvector_ty, llvm_anyvector_ty], [LLVMMatchType<1>, llvm_i32_ty], [IntrReadMem]>;
def int_arm_mve_vldr_gather_base_wb_predicated: Intrinsic<[llvm_anyvector_ty, llvm_anyvector_ty], [LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty], [IntrReadMem]>;

def int_arm_mve_urshrl: Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;

def int_arm_mve_vadc: Intrinsic<[llvm_anyvector_ty, llvm_i32_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>;
def int_arm_mve_vadc_predicated: Intrinsic<[llvm_anyvector_ty, llvm_i32_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;

+def int_arm_mve_vld2q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>;
+def int_arm_mve_vld4q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>;
+
+def int_arm_mve_vst2q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrWriteMem]>;
+def int_arm_mve_vst4q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty], [IntrWriteMem]
+>;
+
} // end TargetPrefix

diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6fe5e5933149..59acc34906e4 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1,4756 +1,4830 @@
//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the ARM target.
// //===----------------------------------------------------------------------===// #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMTargetMachine.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "Utils/ARMBaseInfo.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; #define DEBUG_TYPE "arm-isel" static cl::opt DisableShifterOp("disable-shifter-op", cl::Hidden, cl::desc("Disable isel of shifter-op"), cl::init(false)); //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine /// instructions for SelectionDAG operations. /// namespace { class ARMDAGToDAGISel : public SelectionDAGISel { /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can /// make the right decision when generating code for different targets. const ARMSubtarget *Subtarget; public: explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel) {} bool runOnMachineFunction(MachineFunction &MF) override { // Reset the subtarget each time through. Subtarget = &MF.getSubtarget(); SelectionDAGISel::runOnMachineFunction(MF); return true; } StringRef getPassName() const override { return "ARM Instruction Selection"; } void PreprocessISelDAG() override; /// getI32Imm - Return a target constant of type i32 with the specified /// value. 
inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) { return CurDAG->getTargetConstant(Imm, dl, MVT::i32); } void Select(SDNode *N) override; bool hasNoVMLxHazardUse(SDNode *N) const; bool isShifterOpProfitable(const SDValue &Shift, ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt); bool SelectRegShifterOperand(SDValue N, SDValue &A, SDValue &B, SDValue &C, bool CheckProfitability = true); bool SelectImmShifterOperand(SDValue N, SDValue &A, SDValue &B, bool CheckProfitability = true); bool SelectShiftRegShifterOperand(SDValue N, SDValue &A, SDValue &B, SDValue &C) { // Don't apply the profitability check return SelectRegShifterOperand(N, A, B, C, false); } bool SelectShiftImmShifterOperand(SDValue N, SDValue &A, SDValue &B) { // Don't apply the profitability check return SelectImmShifterOperand(N, A, B, false); } bool SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out); bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc); bool SelectCMOVPred(SDValue N, SDValue &Pred, SDValue &Reg) { const ConstantSDNode *CN = cast(N); Pred = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(N), MVT::i32); Reg = CurDAG->getRegister(ARM::CPSR, MVT::i32); return true; } bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc); bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc); bool SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc); bool SelectAddrOffsetNone(SDValue N, SDValue &Base); bool SelectAddrMode3(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc); bool SelectAddrMode3Offset(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc); bool IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset, bool FP16); bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset); bool SelectAddrMode5FP16(SDValue N, SDValue &Base, SDValue &Offset); bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align); bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset); bool SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label); // Thumb Addressing Modes: bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset); bool SelectThumbAddrModeRRSext(SDValue N, SDValue &Base, SDValue &Offset); bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base, SDValue &OffImm); bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm); template bool SelectTAddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm); // Thumb 2 Addressing Modes: bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, SDValue &OffImm); template bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm); bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm, unsigned Shift); template bool SelectT2AddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm); bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm); inline bool is_so_imm(unsigned Imm) const { return ARM_AM::getSOImmVal(Imm) != -1; } inline bool 
is_so_imm_not(unsigned Imm) const { return ARM_AM::getSOImmVal(~Imm) != -1; } inline bool is_t2_so_imm(unsigned Imm) const { return ARM_AM::getT2SOImmVal(Imm) != -1; } inline bool is_t2_so_imm_not(unsigned Imm) const { return ARM_AM::getT2SOImmVal(~Imm) != -1; } // Include the pieces autogenerated from the target description. #include "ARMGenDAGISel.inc" private: void transferMemOperands(SDNode *Src, SDNode *Dst); /// Indexed (pre/post inc/dec) load matching code for ARM. bool tryARMIndexedLoad(SDNode *N); bool tryT1IndexedLoad(SDNode *N); bool tryT2IndexedLoad(SDNode *N); bool tryMVEIndexedLoad(SDNode *N); /// SelectVLD - Select NEON load intrinsics. NumVecs should be /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// loads of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. void SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1); /// SelectVST - Select NEON store intrinsics. NumVecs should /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// stores of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. void SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1); /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should /// be 2, 3 or 4. The opcode arrays specify the instructions used for /// load/store of D registers and Q registers. void SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes); /// Helper functions for setting up clusters of MVE predication operands. template void AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, SDValue PredicateMask); template void AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, SDValue PredicateMask, SDValue Inactive); template void AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc); template void AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, EVT InactiveTy); /// SelectMVE_WB - Select MVE writeback load/store intrinsics. void SelectMVE_WB(SDNode *N, const uint16_t *Opcodes, bool Predicated); /// SelectMVE_LongShift - Select MVE 64-bit scalar shift intrinsics. void SelectMVE_LongShift(SDNode *N, uint16_t Opcode, bool Immediate); /// SelectMVE_VADCSBC - Select MVE vector add/sub-with-carry intrinsics. void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry, uint16_t OpcodeWithNoCarry, bool Add, bool Predicated); + /// SelectMVE_VLD - Select MVE interleaving load intrinsics. NumVecs + /// should be 2 or 4. The opcode array specifies the instructions + /// used for 8, 16 and 32-bit lane sizes respectively, and each + /// pointer points to a set of NumVecs sub-opcodes used for the + /// different stages (e.g. VLD20 versus VLD21) of each load family. + void SelectMVE_VLD(SDNode *N, unsigned NumVecs, + const uint16_t *const *Opcodes); + /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs /// should be 1, 2, 3 or 4. The opcode array specifies the instructions used /// for loading D registers. void SelectVLDDup(SDNode *N, bool IsIntrinsic, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes0 = nullptr, const uint16_t *QOpcodes1 = nullptr); /// Try to select SBFX/UBFX instructions for ARM. 
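// Illustrative sketch (not from this patch): in scalar terms, UBFX Rd, Rn, #lsb,
// #width extracts `width` bits of Rn starting at bit `lsb` and zero-extends them
// (SBFX sign-extends instead). A minimal model with an invented helper name;
// PreprocessISelDAG below reshapes srl/and chains so they match this pattern.

#include <cstdint>

static uint32_t ubfx_model(uint32_t x, unsigned lsb, unsigned width) {
  // Assumes 0 < width and lsb + width <= 32, as the instruction requires.
  uint32_t mask = (width < 32) ? ((1u << width) - 1) : 0xFFFFFFFFu;
  return (x >> lsb) & mask;
}
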
bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned); // Select special operations if node forms integer ABS pattern bool tryABSOp(SDNode *N); bool tryReadRegister(SDNode *N); bool tryWriteRegister(SDNode *N); bool tryInlineAsm(SDNode *N); void SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI); void SelectCMP_SWAP(SDNode *N); /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) override; // Form pairs of consecutive R, S, D, or Q registers. SDNode *createGPRPairNode(EVT VT, SDValue V0, SDValue V1); SDNode *createSRegPairNode(EVT VT, SDValue V0, SDValue V1); SDNode *createDRegPairNode(EVT VT, SDValue V0, SDValue V1); SDNode *createQRegPairNode(EVT VT, SDValue V0, SDValue V1); // Form sequences of 4 consecutive S, D, or Q registers. SDNode *createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); SDNode *createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); SDNode *createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); // Get the alignment operand for a NEON VLD or VST instruction. SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs, bool is64BitVector); /// Checks if N is a multiplication by a constant where we can extract out a /// power of two from the constant so that it can be used in a shift, but only /// if it simplifies the materialization of the constant. Returns true if it /// is, and assigns to PowerOfTwo the power of two that should be extracted /// out and to NewMulConst the new constant to be multiplied by. bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift, unsigned &PowerOfTwo, SDValue &NewMulConst) const; /// Replace N with M in CurDAG, in a way that also ensures that M gets /// selected when N would have been selected. void replaceDAGValue(const SDValue &N, SDValue M); }; } /// isInt32Immediate - This method tests to see if the node is a 32-bit constant /// operand. If so Imm will receive the 32-bit value. static bool isInt32Immediate(SDNode *N, unsigned &Imm) { if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) { Imm = cast(N)->getZExtValue(); return true; } return false; } // isInt32Immediate - This method tests to see if a constant operand. // If so Imm will receive the 32 bit value. static bool isInt32Immediate(SDValue N, unsigned &Imm) { return isInt32Immediate(N.getNode(), Imm); } // isOpcWithIntImmediate - This method tests to see if the node is a specific // opcode and that it has a immediate integer right operand. // If so Imm will receive the 32 bit value. static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) { return N->getOpcode() == Opc && isInt32Immediate(N->getOperand(1).getNode(), Imm); } /// Check whether a particular node is a constant value representable as /// (N * Scale) where (N in [\p RangeMin, \p RangeMax). /// /// \param ScaledConstant [out] - On success, the pre-scaled constant value. static bool isScaledConstantInRange(SDValue Node, int Scale, int RangeMin, int RangeMax, int &ScaledConstant) { assert(Scale > 0 && "Invalid scale!"); // Check that this is a constant. 
const ConstantSDNode *C = dyn_cast(Node); if (!C) return false; ScaledConstant = (int) C->getZExtValue(); if ((ScaledConstant % Scale) != 0) return false; ScaledConstant /= Scale; return ScaledConstant >= RangeMin && ScaledConstant < RangeMax; } void ARMDAGToDAGISel::PreprocessISelDAG() { if (!Subtarget->hasV6T2Ops()) return; bool isThumb2 = Subtarget->isThumb(); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. if (N->getOpcode() != ISD::ADD) continue; // Look for (add X1, (and (srl X2, c1), c2)) where c2 is constant with // leading zeros, followed by consecutive set bits, followed by 1 or 2 // trailing zeros, e.g. 1020. // Transform the expression to // (add X1, (shl (and (srl X2, c1), (c2>>tz)), tz)) where tz is the number // of trailing zeros of c2. The left shift would be folded as an shifter // operand of 'add' and the 'and' and 'srl' would become a bits extraction // node (UBFX). SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned And_imm = 0; if (!isOpcWithIntImmediate(N1.getNode(), ISD::AND, And_imm)) { if (isOpcWithIntImmediate(N0.getNode(), ISD::AND, And_imm)) std::swap(N0, N1); } if (!And_imm) continue; // Check if the AND mask is an immediate of the form: 000.....1111111100 unsigned TZ = countTrailingZeros(And_imm); if (TZ != 1 && TZ != 2) // Be conservative here. Shifter operands aren't always free. e.g. On // Swift, left shifter operand of 1 / 2 for free but others are not. // e.g. // ubfx r3, r1, #16, #8 // ldr.w r3, [r0, r3, lsl #2] // vs. // mov.w r9, #1020 // and.w r2, r9, r1, lsr #14 // ldr r2, [r0, r2] continue; And_imm >>= TZ; if (And_imm & (And_imm + 1)) continue; // Look for (and (srl X, c1), c2). SDValue Srl = N1.getOperand(0); unsigned Srl_imm = 0; if (!isOpcWithIntImmediate(Srl.getNode(), ISD::SRL, Srl_imm) || (Srl_imm <= 2)) continue; // Make sure first operand is not a shifter operand which would prevent // folding of the left shift. SDValue CPTmp0; SDValue CPTmp1; SDValue CPTmp2; if (isThumb2) { if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1)) continue; } else { if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) || SelectRegShifterOperand(N0, CPTmp0, CPTmp1, CPTmp2)) continue; } // Now make the transformation. Srl = CurDAG->getNode(ISD::SRL, SDLoc(Srl), MVT::i32, Srl.getOperand(0), CurDAG->getConstant(Srl_imm + TZ, SDLoc(Srl), MVT::i32)); N1 = CurDAG->getNode(ISD::AND, SDLoc(N1), MVT::i32, Srl, CurDAG->getConstant(And_imm, SDLoc(Srl), MVT::i32)); N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32, N1, CurDAG->getConstant(TZ, SDLoc(Srl), MVT::i32)); CurDAG->UpdateNodeOperands(N, N0, N1); } } /// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS /// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at /// least on current ARM implementations) which should be avoidded. 
bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const { if (OptLevel == CodeGenOpt::None) return true; if (!Subtarget->hasVMLxHazards()) return true; if (!N->hasOneUse()) return false; SDNode *Use = *N->use_begin(); if (Use->getOpcode() == ISD::CopyToReg) return true; if (Use->isMachineOpcode()) { const ARMBaseInstrInfo *TII = static_cast( CurDAG->getSubtarget().getInstrInfo()); const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode()); if (MCID.mayStore()) return true; unsigned Opcode = MCID.getOpcode(); if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) return true; // vmlx feeding into another vmlx. We actually want to unfold // the use later in the MLxExpansion pass. e.g. // vmla // vmla (stall 8 cycles) // // vmul (5 cycles) // vadd (5 cycles) // vmla // This adds up to about 18 - 19 cycles. // // vmla // vmul (stall 4 cycles) // vadd adds up to about 14 cycles. return TII->isFpMLxInstruction(Opcode); } return false; } bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt) { if (!Subtarget->isLikeA9() && !Subtarget->isSwift()) return true; if (Shift.hasOneUse()) return true; // R << 2 is free. return ShOpcVal == ARM_AM::lsl && (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1)); } bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, unsigned MaxShift, unsigned &PowerOfTwo, SDValue &NewMulConst) const { assert(N.getOpcode() == ISD::MUL); assert(MaxShift > 0); // If the multiply is used in more than one place then changing the constant // will make other uses incorrect, so don't. if (!N.hasOneUse()) return false; // Check if the multiply is by a constant ConstantSDNode *MulConst = dyn_cast(N.getOperand(1)); if (!MulConst) return false; // If the constant is used in more than one place then modifying it will mean // we need to materialize two constants instead of one, which is a bad idea. if (!MulConst->hasOneUse()) return false; unsigned MulConstVal = MulConst->getZExtValue(); if (MulConstVal == 0) return false; // Find the largest power of 2 that MulConstVal is a multiple of PowerOfTwo = MaxShift; while ((MulConstVal % (1 << PowerOfTwo)) != 0) { --PowerOfTwo; if (PowerOfTwo == 0) return false; } // Only optimise if the new cost is better unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo); NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32); unsigned OldCost = ConstantMaterializationCost(MulConstVal, Subtarget); unsigned NewCost = ConstantMaterializationCost(NewMulConstVal, Subtarget); return NewCost < OldCost; } void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) { CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode()); ReplaceUses(N, M); } bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, SDValue &BaseReg, SDValue &Opc, bool CheckProfitability) { if (DisableShifterOp) return false; // If N is a multiply-by-constant and it's profitable to extract a shift and // use it in a shifted operand do so. if (N.getOpcode() == ISD::MUL) { unsigned PowerOfTwo = 0; SDValue NewMulConst; if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) { HandleSDNode Handle(N); SDLoc Loc(N); replaceDAGValue(N.getOperand(1), NewMulConst); BaseReg = Handle.getValue(); Opc = CurDAG->getTargetConstant( ARM_AM::getSORegOpc(ARM_AM::lsl, PowerOfTwo), Loc, MVT::i32); return true; } } ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); // Don't match base register only case. That is matched to a separate // lower complexity pattern with explicit register operand. 
if (ShOpcVal == ARM_AM::no_shift) return false; BaseReg = N.getOperand(0); unsigned ShImmVal = 0; ConstantSDNode *RHS = dyn_cast(N.getOperand(1)); if (!RHS) return false; ShImmVal = RHS->getZExtValue() & 31; Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), SDLoc(N), MVT::i32); return true; } bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N, SDValue &BaseReg, SDValue &ShReg, SDValue &Opc, bool CheckProfitability) { if (DisableShifterOp) return false; ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); // Don't match base register only case. That is matched to a separate // lower complexity pattern with explicit register operand. if (ShOpcVal == ARM_AM::no_shift) return false; BaseReg = N.getOperand(0); unsigned ShImmVal = 0; ConstantSDNode *RHS = dyn_cast(N.getOperand(1)); if (RHS) return false; ShReg = N.getOperand(1); if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal)) return false; Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), SDLoc(N), MVT::i32); return true; } // Determine whether an ISD::OR's operands are suitable to turn the operation // into an addition, which often has more compact encodings. bool ARMDAGToDAGISel::SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out) { assert(Parent->getOpcode() == ISD::OR && "unexpected parent"); Out = N; return CurDAG->haveNoCommonBitsSet(N, Parent->getOperand(1)); } bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R + imm12 operands. // Base only. if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && !CurDAG->isBaseWithConstantOffset(N)) { if (N.getOpcode() == ISD::FrameIndex) { // Match frame index. int FI = cast(N)->getIndex(); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol && N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else Base = N; OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { int RHSC = (int)RHS->getSExtValue(); if (N.getOpcode() == ISD::SUB) RHSC = -RHSC; if (RHSC > -0x1000 && RHSC < 0x1000) { // 12 bits Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; } } // Base only. Base = N; OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc) { if (N.getOpcode() == ISD::MUL && ((!Subtarget->isLikeA9() && !Subtarget->isSwift()) || N.hasOneUse())) { if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { // X * [3,5,9] -> X + X * [2,4,8] etc. 
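// Illustrative sketch (not from this patch): the rewrite described in the comment
// above relies on the identity X*m == X + X*(m-1) for odd m; when m-1 is a power
// of two, the second term is a left shift that the reg+shifted-reg addressing mode
// encodes for free. A self-checking example with invented names:

#include <cassert>
#include <cstdint>

static uint32_t mul9_as_shifted_add(uint32_t x) {
  return x + (x << 3); // X*9 == X + X*8 == X + (X << 3), modulo 2^32
}

int main() {
  for (uint32_t x : {0u, 1u, 7u, 123456u, 0xFFFFFFFFu})
    assert(mul9_as_shifted_add(x) == x * 9u);
  return 0;
}
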
int RHSC = (int)RHS->getZExtValue(); if (RHSC & 1) { RHSC = RHSC & ~1; ARM_AM::AddrOpc AddSub = ARM_AM::add; if (RHSC < 0) { AddSub = ARM_AM::sub; RHSC = - RHSC; } if (isPowerOf2_32(RHSC)) { unsigned ShAmt = Log2_32(RHSC); Base = Offset = N.getOperand(0); Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ARM_AM::lsl), SDLoc(N), MVT::i32); return true; } } } } if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && // ISD::OR that is equivalent to an ISD::ADD. !CurDAG->isBaseWithConstantOffset(N)) return false; // Leave simple R +/- imm12 operands for LDRi12 if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) { int RHSC; if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1, -0x1000+1, 0x1000, RHSC)) // 12 bits. return false; } // Otherwise this is R +/- [possibly shifted] R. ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::SUB ? ARM_AM::sub:ARM_AM::add; ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode()); unsigned ShAmt = 0; Base = N.getOperand(0); Offset = N.getOperand(1); if (ShOpcVal != ARM_AM::no_shift) { // Check to see if the RHS of the shift is a constant, if not, we can't fold // it. if (ConstantSDNode *Sh = dyn_cast(N.getOperand(1).getOperand(1))) { ShAmt = Sh->getZExtValue(); if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt)) Offset = N.getOperand(1).getOperand(0); else { ShAmt = 0; ShOpcVal = ARM_AM::no_shift; } } else { ShOpcVal = ARM_AM::no_shift; } } // Try matching (R shl C) + (R). if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift && !(Subtarget->isLikeA9() || Subtarget->isSwift() || N.getOperand(0).hasOneUse())) { ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode()); if (ShOpcVal != ARM_AM::no_shift) { // Check to see if the RHS of the shift is a constant, if not, we can't // fold it. if (ConstantSDNode *Sh = dyn_cast(N.getOperand(0).getOperand(1))) { ShAmt = Sh->getZExtValue(); if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) { Offset = N.getOperand(0).getOperand(0); Base = N.getOperand(1); } else { ShAmt = 0; ShOpcVal = ARM_AM::no_shift; } } else { ShOpcVal = ARM_AM::no_shift; } } } // If Offset is a multiply-by-constant and it's profitable to extract a shift // and use it in a shifted operand do so. if (Offset.getOpcode() == ISD::MUL && N.hasOneUse()) { unsigned PowerOfTwo = 0; SDValue NewMulConst; if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) { HandleSDNode Handle(Offset); replaceDAGValue(Offset.getOperand(1), NewMulConst); Offset = Handle.getValue(); ShAmt = PowerOfTwo; ShOpcVal = ARM_AM::lsl; } } Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), SDLoc(N), MVT::i32); return true; } bool ARMDAGToDAGISel::SelectAddrMode2OffsetReg(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc) { unsigned Opcode = Op->getOpcode(); ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) ? cast(Op)->getAddressingMode() : cast(Op)->getAddressingMode(); ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) ? ARM_AM::add : ARM_AM::sub; int Val; if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) return false; Offset = N; ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); unsigned ShAmt = 0; if (ShOpcVal != ARM_AM::no_shift) { // Check to see if the RHS of the shift is a constant, if not, we can't fold // it. 
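    // When the fold does apply, the offset ends up as a shifted register,
    // e.g. the post-indexed form "ldr r0, [r1], r2, lsl #2".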
if (ConstantSDNode *Sh = dyn_cast(N.getOperand(1))) { ShAmt = Sh->getZExtValue(); if (isShifterOpProfitable(N, ShOpcVal, ShAmt)) Offset = N.getOperand(0); else { ShAmt = 0; ShOpcVal = ARM_AM::no_shift; } } else { ShOpcVal = ARM_AM::no_shift; } } Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), SDLoc(N), MVT::i32); return true; } bool ARMDAGToDAGISel::SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc) { unsigned Opcode = Op->getOpcode(); ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) ? cast(Op)->getAddressingMode() : cast(Op)->getAddressingMode(); ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) ? ARM_AM::add : ARM_AM::sub; int Val; if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits. if (AddSub == ARM_AM::sub) Val *= -1; Offset = CurDAG->getRegister(0, MVT::i32); Opc = CurDAG->getTargetConstant(Val, SDLoc(Op), MVT::i32); return true; } return false; } bool ARMDAGToDAGISel::SelectAddrMode2OffsetImm(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc) { unsigned Opcode = Op->getOpcode(); ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) ? cast(Op)->getAddressingMode() : cast(Op)->getAddressingMode(); ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) ? ARM_AM::add : ARM_AM::sub; int Val; if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits. Offset = CurDAG->getRegister(0, MVT::i32); Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift), SDLoc(Op), MVT::i32); return true; } return false; } bool ARMDAGToDAGISel::SelectAddrOffsetNone(SDValue N, SDValue &Base) { Base = N; return true; } bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc) { if (N.getOpcode() == ISD::SUB) { // X - C is canonicalize to X + -C, no need to handle it here. Base = N.getOperand(0); Offset = N.getOperand(1); Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0), SDLoc(N), MVT::i32); return true; } if (!CurDAG->isBaseWithConstantOffset(N)) { Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast(N)->getIndex(); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } Offset = CurDAG->getRegister(0, MVT::i32); Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), SDLoc(N), MVT::i32); return true; } // If the RHS is +/- imm8, fold into addr mode. int RHSC; if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1, -256 + 1, 256, RHSC)) { // 8 bits. Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } Offset = CurDAG->getRegister(0, MVT::i32); ARM_AM::AddrOpc AddSub = ARM_AM::add; if (RHSC < 0) { AddSub = ARM_AM::sub; RHSC = -RHSC; } Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC), SDLoc(N), MVT::i32); return true; } Base = N.getOperand(0); Offset = N.getOperand(1); Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), SDLoc(N), MVT::i32); return true; } bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc) { unsigned Opcode = Op->getOpcode(); ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) ? cast(Op)->getAddressingMode() : cast(Op)->getAddressingMode(); ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) ? ARM_AM::add : ARM_AM::sub; int Val; if (isScaledConstantInRange(N, /*Scale=*/1, 0, 256, Val)) { // 12 bits. 
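    // (The range check above accepts 0..255, i.e. addrmode3's 8-bit
    // immediate offset.)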
Offset = CurDAG->getRegister(0, MVT::i32); Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), SDLoc(Op), MVT::i32); return true; } Offset = N; Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), SDLoc(Op), MVT::i32); return true; } bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset, bool FP16) { if (!CurDAG->isBaseWithConstantOffset(N)) { Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast(N)->getIndex(); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol && N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), SDLoc(N), MVT::i32); return true; } // If the RHS is +/- imm8, fold into addr mode. int RHSC; const int Scale = FP16 ? 2 : 4; if (isScaledConstantInRange(N.getOperand(1), Scale, -255, 256, RHSC)) { Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } ARM_AM::AddrOpc AddSub = ARM_AM::add; if (RHSC < 0) { AddSub = ARM_AM::sub; RHSC = -RHSC; } if (FP16) Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(AddSub, RHSC), SDLoc(N), MVT::i32); else Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC), SDLoc(N), MVT::i32); return true; } Base = N; if (FP16) Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(ARM_AM::add, 0), SDLoc(N), MVT::i32); else Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), SDLoc(N), MVT::i32); return true; } bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset) { return IsAddressingMode5(N, Base, Offset, /*FP16=*/ false); } bool ARMDAGToDAGISel::SelectAddrMode5FP16(SDValue N, SDValue &Base, SDValue &Offset) { return IsAddressingMode5(N, Base, Offset, /*FP16=*/ true); } bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr, SDValue &Align) { Addr = N; unsigned Alignment = 0; MemSDNode *MemN = cast(Parent); if (isa(MemN) || ((MemN->getOpcode() == ARMISD::VST1_UPD || MemN->getOpcode() == ARMISD::VLD1_UPD) && MemN->getConstantOperandVal(MemN->getNumOperands() - 1) == 1)) { // This case occurs only for VLD1-lane/dup and VST1-lane instructions. // The maximum alignment is equal to the memory size being referenced. unsigned MMOAlign = MemN->getAlignment(); unsigned MemSize = MemN->getMemoryVT().getSizeInBits() / 8; if (MMOAlign >= MemSize && MemSize > 1) Alignment = MemSize; } else { // All other uses of addrmode6 are for intrinsics. For now just record // the raw alignment value; it will be refined later based on the legal // alignment operands for the intrinsic. 
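    // The alignment operand is a byte count; e.g. an 8-byte-aligned access is
    // what ends up printed as "[rN:64]" in the assembly.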
Alignment = MemN->getAlignment(); } Align = CurDAG->getTargetConstant(Alignment, SDLoc(N), MVT::i32); return true; } bool ARMDAGToDAGISel::SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset) { LSBaseSDNode *LdSt = cast(Op); ISD::MemIndexedMode AM = LdSt->getAddressingMode(); if (AM != ISD::POST_INC) return false; Offset = N; if (ConstantSDNode *NC = dyn_cast(N)) { if (NC->getZExtValue() * 8 == LdSt->getMemoryVT().getSizeInBits()) Offset = CurDAG->getRegister(0, MVT::i32); } return true; } bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label) { if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) { Offset = N.getOperand(0); SDValue N1 = N.getOperand(1); Label = CurDAG->getTargetConstant(cast(N1)->getZExtValue(), SDLoc(N), MVT::i32); return true; } return false; } //===----------------------------------------------------------------------===// // Thumb Addressing Modes //===----------------------------------------------------------------------===// static bool shouldUseZeroOffsetLdSt(SDValue N) { // Negative numbers are difficult to materialise in thumb1. If we are // selecting the add of a negative, instead try to select ri with a zero // offset, so create the add node directly which will become a sub. if (N.getOpcode() != ISD::ADD) return false; // Look for an imm which is not legal for ld/st, but is legal for sub. if (auto C = dyn_cast(N.getOperand(1))) return C->getSExtValue() < 0 && C->getSExtValue() >= -255; return false; } bool ARMDAGToDAGISel::SelectThumbAddrModeRRSext(SDValue N, SDValue &Base, SDValue &Offset) { if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) { ConstantSDNode *NC = dyn_cast(N); if (!NC || !NC->isNullValue()) return false; Base = Offset = N; return true; } Base = N.getOperand(0); Offset = N.getOperand(1); return true; } bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset) { if (shouldUseZeroOffsetLdSt(N)) return false; // Select ri instead return SelectThumbAddrModeRRSext(N, Base, Offset); } bool ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base, SDValue &OffImm) { if (shouldUseZeroOffsetLdSt(N)) { Base = N; OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } if (!CurDAG->isBaseWithConstantOffset(N)) { if (N.getOpcode() == ISD::ADD) { return false; // We want to select register offset instead } else if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol && N.getOperand(0).getOpcode() != ISD::TargetConstantPool && N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else { Base = N; } OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } // If the RHS is + imm5 * scale, fold into addr mode. int RHSC; if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) { Base = N.getOperand(0); OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; } // Offset is too large, so use register offset instead. 
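  // (The imm5 form only reaches 31 * Scale bytes, e.g. byte offsets 0..124
  // for word-sized accesses.)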
return false; } bool ARMDAGToDAGISel::SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base, SDValue &OffImm) { return SelectThumbAddrModeImm5S(N, 4, Base, OffImm); } bool ARMDAGToDAGISel::SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base, SDValue &OffImm) { return SelectThumbAddrModeImm5S(N, 2, Base, OffImm); } bool ARMDAGToDAGISel::SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base, SDValue &OffImm) { return SelectThumbAddrModeImm5S(N, 1, Base, OffImm); } bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm) { if (N.getOpcode() == ISD::FrameIndex) { int FI = cast(N)->getIndex(); // Only multiples of 4 are allowed for the offset, so the frame object // alignment must be at least 4. MachineFrameInfo &MFI = MF->getFrameInfo(); if (MFI.getObjectAlignment(FI) < 4) MFI.setObjectAlignment(FI, 4); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } if (!CurDAG->isBaseWithConstantOffset(N)) return false; if (N.getOperand(0).getOpcode() == ISD::FrameIndex) { // If the RHS is + imm8 * scale, fold into addr mode. int RHSC; if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) { Base = N.getOperand(0); int FI = cast(Base)->getIndex(); // Make sure the offset is inside the object, or we might fail to // allocate an emergency spill slot. (An out-of-range access is UB, but // it could show up anyway.) MachineFrameInfo &MFI = MF->getFrameInfo(); if (RHSC * 4 < MFI.getObjectSize(FI)) { // For LHS+RHS to result in an offset that's a multiple of 4 the object // indexed by the LHS must be 4-byte aligned. if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlignment(FI) < 4) MFI.setObjectAlignment(FI, 4); if (MFI.getObjectAlignment(FI) >= 4) { Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; } } } } return false; } template bool ARMDAGToDAGISel::SelectTAddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm) { if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) { int RHSC; if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80, RHSC)) { Base = N.getOperand(0); if (N.getOpcode() == ISD::SUB) RHSC = -RHSC; OffImm = CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32); return true; } } // Base only. Base = N; OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } //===----------------------------------------------------------------------===// // Thumb 2 Addressing Modes //===----------------------------------------------------------------------===// bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R + imm12 operands. // Base only. if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && !CurDAG->isBaseWithConstantOffset(N)) { if (N.getOpcode() == ISD::FrameIndex) { // Match frame index. int FI = cast(N)->getIndex(); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol && N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); if (Base.getOpcode() == ISD::TargetConstantPool) return false; // We want to select t2LDRpci instead. 
} else Base = N; OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { if (SelectT2AddrModeImm8(N, Base, OffImm)) // Let t2LDRi8 handle (R - imm8). return false; int RHSC = (int)RHS->getZExtValue(); if (N.getOpcode() == ISD::SUB) RHSC = -RHSC; if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned) Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; } } // Base only. Base = N; OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R - imm8 operands. if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && !CurDAG->isBaseWithConstantOffset(N)) return false; if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { int RHSC = (int)RHS->getSExtValue(); if (N.getOpcode() == ISD::SUB) RHSC = -RHSC; if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative) Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; } } return false; } bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, SDValue &OffImm){ unsigned Opcode = Op->getOpcode(); ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) ? cast(Op)->getAddressingMode() : cast(Op)->getAddressingMode(); int RHSC; if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x100, RHSC)) { // 8 bits. OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) ? CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32) : CurDAG->getTargetConstant(-RHSC, SDLoc(N), MVT::i32); return true; } return false; } template bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm) { if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) { int RHSC; if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80, RHSC)) { Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } if (N.getOpcode() == ISD::SUB) RHSC = -RHSC; OffImm = CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32); return true; } } // Base only. Base = N; OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } template bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm) { return SelectT2AddrModeImm7Offset(Op, N, OffImm, Shift); } bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm, unsigned Shift) { unsigned Opcode = Op->getOpcode(); ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) ? cast(Op)->getAddressingMode() : cast(Op)->getAddressingMode(); int RHSC; if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { // 7 bits. OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) ? CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32) : CurDAG->getTargetConstant(-RHSC * (1 << Shift), SDLoc(N), MVT::i32); return true; } return false; } bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm) { // (R - imm8) should be handled by t2LDRi8. 
The rest are handled by t2LDRi12. if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) return false; // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8. if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned) return false; else if (RHSC < 0 && RHSC >= -255) // 8 bits return false; } // Look for (R + R) or (R + (R << [1,2,3])). unsigned ShAmt = 0; Base = N.getOperand(0); OffReg = N.getOperand(1); // Swap if it is ((R << c) + R). ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg.getOpcode()); if (ShOpcVal != ARM_AM::lsl) { ShOpcVal = ARM_AM::getShiftOpcForNode(Base.getOpcode()); if (ShOpcVal == ARM_AM::lsl) std::swap(Base, OffReg); } if (ShOpcVal == ARM_AM::lsl) { // Check to see if the RHS of the shift is a constant, if not, we can't fold // it. if (ConstantSDNode *Sh = dyn_cast(OffReg.getOperand(1))) { ShAmt = Sh->getZExtValue(); if (ShAmt < 4 && isShifterOpProfitable(OffReg, ShOpcVal, ShAmt)) OffReg = OffReg.getOperand(0); else { ShAmt = 0; } } } // If OffReg is a multiply-by-constant and it's profitable to extract a shift // and use it in a shifted operand do so. if (OffReg.getOpcode() == ISD::MUL && N.hasOneUse()) { unsigned PowerOfTwo = 0; SDValue NewMulConst; if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) { HandleSDNode Handle(OffReg); replaceDAGValue(OffReg.getOperand(1), NewMulConst); OffReg = Handle.getValue(); ShAmt = PowerOfTwo; } } ShImm = CurDAG->getTargetConstant(ShAmt, SDLoc(N), MVT::i32); return true; } bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm) { // This *must* succeed since it's used for the irreplaceable ldrex and strex // instructions. Base = N; OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); if (N.getOpcode() != ISD::ADD || !CurDAG->isBaseWithConstantOffset(N)) return true; ConstantSDNode *RHS = dyn_cast(N.getOperand(1)); if (!RHS) return true; uint32_t RHSC = (int)RHS->getZExtValue(); if (RHSC > 1020 || RHSC % 4 != 0) return true; Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC/4, SDLoc(N), MVT::i32); return true; } //===--------------------------------------------------------------------===// /// getAL - Returns a ARMCC::AL immediate node. static inline SDValue getAL(SelectionDAG *CurDAG, const SDLoc &dl) { return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, dl, MVT::i32); } void ARMDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { MachineMemOperand *MemOp = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(Result), {MemOp}); } bool ARMDAGToDAGISel::tryARMIndexedLoad(SDNode *N) { LoadSDNode *LD = cast(N); ISD::MemIndexedMode AM = LD->getAddressingMode(); if (AM == ISD::UNINDEXED) return false; EVT LoadedVT = LD->getMemoryVT(); SDValue Offset, AMOpc; bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); unsigned Opcode = 0; bool Match = false; if (LoadedVT == MVT::i32 && isPre && SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) { Opcode = ARM::LDR_PRE_IMM; Match = true; } else if (LoadedVT == MVT::i32 && !isPre && SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) { Opcode = ARM::LDR_POST_IMM; Match = true; } else if (LoadedVT == MVT::i32 && SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) { Opcode = isPre ? 
ARM::LDR_PRE_REG : ARM::LDR_POST_REG; Match = true; } else if (LoadedVT == MVT::i16 && SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) { Match = true; Opcode = (LD->getExtensionType() == ISD::SEXTLOAD) ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST) : (isPre ? ARM::LDRH_PRE : ARM::LDRH_POST); } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) { if (LD->getExtensionType() == ISD::SEXTLOAD) { if (SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) { Match = true; Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST; } } else { if (isPre && SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) { Match = true; Opcode = ARM::LDRB_PRE_IMM; } else if (!isPre && SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) { Match = true; Opcode = ARM::LDRB_POST_IMM; } else if (SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) { Match = true; Opcode = isPre ? ARM::LDRB_PRE_REG : ARM::LDRB_POST_REG; } } } if (Match) { if (Opcode == ARM::LDR_PRE_IMM || Opcode == ARM::LDRB_PRE_IMM) { SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[]= { Base, AMOpc, getAL(CurDAG, SDLoc(N)), CurDAG->getRegister(0, MVT::i32), Chain }; SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32, MVT::Other, Ops); transferMemOperands(N, New); ReplaceNode(N, New); return true; } else { SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG, SDLoc(N)), CurDAG->getRegister(0, MVT::i32), Chain }; SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32, MVT::Other, Ops); transferMemOperands(N, New); ReplaceNode(N, New); return true; } } return false; } bool ARMDAGToDAGISel::tryT1IndexedLoad(SDNode *N) { LoadSDNode *LD = cast(N); EVT LoadedVT = LD->getMemoryVT(); ISD::MemIndexedMode AM = LD->getAddressingMode(); if (AM != ISD::POST_INC || LD->getExtensionType() != ISD::NON_EXTLOAD || LoadedVT.getSimpleVT().SimpleTy != MVT::i32) return false; auto *COffs = dyn_cast(LD->getOffset()); if (!COffs || COffs->getZExtValue() != 4) return false; // A T1 post-indexed load is just a single register LDM: LDM r0!, {r1}. // The encoding of LDM is not how the rest of ISel expects a post-inc load to // look however, so we use a pseudo here and switch it for a tLDMIA_UPD after // ISel. SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[]= { Base, getAL(CurDAG, SDLoc(N)), CurDAG->getRegister(0, MVT::i32), Chain }; SDNode *New = CurDAG->getMachineNode(ARM::tLDR_postidx, SDLoc(N), MVT::i32, MVT::i32, MVT::Other, Ops); transferMemOperands(N, New); ReplaceNode(N, New); return true; } bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) { LoadSDNode *LD = cast(N); ISD::MemIndexedMode AM = LD->getAddressingMode(); if (AM == ISD::UNINDEXED) return false; EVT LoadedVT = LD->getMemoryVT(); bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; SDValue Offset; bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); unsigned Opcode = 0; bool Match = false; if (SelectT2AddrModeImm8Offset(N, LD->getOffset(), Offset)) { switch (LoadedVT.getSimpleVT().SimpleTy) { case MVT::i32: Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST; break; case MVT::i16: if (isSExtLd) Opcode = isPre ? ARM::t2LDRSH_PRE : ARM::t2LDRSH_POST; else Opcode = isPre ? ARM::t2LDRH_PRE : ARM::t2LDRH_POST; break; case MVT::i8: case MVT::i1: if (isSExtLd) Opcode = isPre ? ARM::t2LDRSB_PRE : ARM::t2LDRSB_POST; else Opcode = isPre ? 
ARM::t2LDRB_PRE : ARM::t2LDRB_POST; break; default: return false; } Match = true; } if (Match) { SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[]= { Base, Offset, getAL(CurDAG, SDLoc(N)), CurDAG->getRegister(0, MVT::i32), Chain }; SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32, MVT::Other, Ops); transferMemOperands(N, New); ReplaceNode(N, New); return true; } return false; } bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { LoadSDNode *LD = cast(N); ISD::MemIndexedMode AM = LD->getAddressingMode(); if (AM == ISD::UNINDEXED) return false; EVT LoadedVT = LD->getMemoryVT(); if (!LoadedVT.isVector()) return false; bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; SDValue Offset; bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); unsigned Opcode = 0; unsigned Align = LD->getAlignment(); bool IsLE = Subtarget->isLittle(); if (Align >= 2 && LoadedVT == MVT::v4i16 && SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post; else Opcode = isPre ? ARM::MVE_VLDRHU32_pre : ARM::MVE_VLDRHU32_post; } else if (LoadedVT == MVT::v8i8 && SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRBS16_pre : ARM::MVE_VLDRBS16_post; else Opcode = isPre ? ARM::MVE_VLDRBU16_pre : ARM::MVE_VLDRBU16_post; } else if (LoadedVT == MVT::v4i8 && SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post; else Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post; } else if (Align >= 4 && (IsLE || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) && SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 2)) Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post; else if (Align >= 2 && (IsLE || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) && SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) Opcode = isPre ? ARM::MVE_VLDRHU16_pre : ARM::MVE_VLDRHU16_post; else if ((IsLE || LoadedVT == MVT::v16i8) && SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) Opcode = isPre ? ARM::MVE_VLDRBU8_pre : ARM::MVE_VLDRBU8_post; else return false; SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[] = {Base, Offset, CurDAG->getTargetConstant(ARMVCC::None, SDLoc(N), MVT::i32), CurDAG->getRegister(0, MVT::i32), Chain}; SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), LD->getValueType(0), MVT::i32, MVT::Other, Ops); transferMemOperands(N, New); ReplaceUses(SDValue(N, 0), SDValue(New, 1)); ReplaceUses(SDValue(N, 1), SDValue(New, 0)); ReplaceUses(SDValue(N, 2), SDValue(New, 2)); CurDAG->RemoveDeadNode(N); return true; } /// Form a GPRPair pseudo register from a pair of GPR regs. SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) { SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::gsub_0, dl, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::gsub_1, dl, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// Form a D register from a pair of S registers. 
SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) { SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, dl, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, dl, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, dl, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// Form a quad register from a pair of D registers. SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) { SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, dl, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, dl, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, dl, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// Form 4 consecutive D registers from a pair of Q registers. SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) { SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, dl, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, dl, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, dl, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// Form 4 consecutive S registers. SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3) { SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, dl, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, dl, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, dl, MVT::i32); SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, dl, MVT::i32); SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, dl, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// Form 4 consecutive D registers. SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3) { SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, dl, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, dl, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, dl, MVT::i32); SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, dl, MVT::i32); SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, dl, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// Form 4 consecutive Q registers. 
SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3) { SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, dl, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, dl, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, dl, MVT::i32); SDValue SubReg2 = CurDAG->getTargetConstant(ARM::qsub_2, dl, MVT::i32); SDValue SubReg3 = CurDAG->getTargetConstant(ARM::qsub_3, dl, MVT::i32); const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); } /// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand /// of a NEON VLD or VST instruction. The supported values depend on the /// number of registers being loaded. SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs, bool is64BitVector) { unsigned NumRegs = NumVecs; if (!is64BitVector && NumVecs < 3) NumRegs *= 2; unsigned Alignment = cast(Align)->getZExtValue(); if (Alignment >= 32 && NumRegs == 4) Alignment = 32; else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4)) Alignment = 16; else if (Alignment >= 8) Alignment = 8; else Alignment = 0; return CurDAG->getTargetConstant(Alignment, dl, MVT::i32); } static bool isVLDfixed(unsigned Opc) { switch (Opc) { default: return false; case ARM::VLD1d8wb_fixed : return true; case ARM::VLD1d16wb_fixed : return true; case ARM::VLD1d64Qwb_fixed : return true; case ARM::VLD1d32wb_fixed : return true; case ARM::VLD1d64wb_fixed : return true; case ARM::VLD1d64TPseudoWB_fixed : return true; case ARM::VLD1d64QPseudoWB_fixed : return true; case ARM::VLD1q8wb_fixed : return true; case ARM::VLD1q16wb_fixed : return true; case ARM::VLD1q32wb_fixed : return true; case ARM::VLD1q64wb_fixed : return true; case ARM::VLD1DUPd8wb_fixed : return true; case ARM::VLD1DUPd16wb_fixed : return true; case ARM::VLD1DUPd32wb_fixed : return true; case ARM::VLD1DUPq8wb_fixed : return true; case ARM::VLD1DUPq16wb_fixed : return true; case ARM::VLD1DUPq32wb_fixed : return true; case ARM::VLD2d8wb_fixed : return true; case ARM::VLD2d16wb_fixed : return true; case ARM::VLD2d32wb_fixed : return true; case ARM::VLD2q8PseudoWB_fixed : return true; case ARM::VLD2q16PseudoWB_fixed : return true; case ARM::VLD2q32PseudoWB_fixed : return true; case ARM::VLD2DUPd8wb_fixed : return true; case ARM::VLD2DUPd16wb_fixed : return true; case ARM::VLD2DUPd32wb_fixed : return true; } } static bool isVSTfixed(unsigned Opc) { switch (Opc) { default: return false; case ARM::VST1d8wb_fixed : return true; case ARM::VST1d16wb_fixed : return true; case ARM::VST1d32wb_fixed : return true; case ARM::VST1d64wb_fixed : return true; case ARM::VST1q8wb_fixed : return true; case ARM::VST1q16wb_fixed : return true; case ARM::VST1q32wb_fixed : return true; case ARM::VST1q64wb_fixed : return true; case ARM::VST1d64TPseudoWB_fixed : return true; case ARM::VST1d64QPseudoWB_fixed : return true; case ARM::VST2d8wb_fixed : return true; case ARM::VST2d16wb_fixed : return true; case ARM::VST2d32wb_fixed : return true; case ARM::VST2q8PseudoWB_fixed : return true; case ARM::VST2q16PseudoWB_fixed : return true; case ARM::VST2q32PseudoWB_fixed : return true; } } // Get the register stride update opcode of a VLD/VST instruction that // is otherwise equivalent to the given fixed stride updating instruction. 
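// For example, "vld1.32 {d0, d1}, [r0]!" post-increments by the transfer
// size and uses a _fixed opcode, while "vld1.32 {d0, d1}, [r0], r2" needs the
// corresponding _register opcode.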
static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { assert((isVLDfixed(Opc) || isVSTfixed(Opc)) && "Incorrect fixed stride updating instruction."); switch (Opc) { default: break; case ARM::VLD1d8wb_fixed: return ARM::VLD1d8wb_register; case ARM::VLD1d16wb_fixed: return ARM::VLD1d16wb_register; case ARM::VLD1d32wb_fixed: return ARM::VLD1d32wb_register; case ARM::VLD1d64wb_fixed: return ARM::VLD1d64wb_register; case ARM::VLD1q8wb_fixed: return ARM::VLD1q8wb_register; case ARM::VLD1q16wb_fixed: return ARM::VLD1q16wb_register; case ARM::VLD1q32wb_fixed: return ARM::VLD1q32wb_register; case ARM::VLD1q64wb_fixed: return ARM::VLD1q64wb_register; case ARM::VLD1d64Twb_fixed: return ARM::VLD1d64Twb_register; case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register; case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register; case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register; case ARM::VLD1DUPd8wb_fixed : return ARM::VLD1DUPd8wb_register; case ARM::VLD1DUPd16wb_fixed : return ARM::VLD1DUPd16wb_register; case ARM::VLD1DUPd32wb_fixed : return ARM::VLD1DUPd32wb_register; case ARM::VLD1DUPq8wb_fixed : return ARM::VLD1DUPq8wb_register; case ARM::VLD1DUPq16wb_fixed : return ARM::VLD1DUPq16wb_register; case ARM::VLD1DUPq32wb_fixed : return ARM::VLD1DUPq32wb_register; case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register; case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register; case ARM::VST1d32wb_fixed: return ARM::VST1d32wb_register; case ARM::VST1d64wb_fixed: return ARM::VST1d64wb_register; case ARM::VST1q8wb_fixed: return ARM::VST1q8wb_register; case ARM::VST1q16wb_fixed: return ARM::VST1q16wb_register; case ARM::VST1q32wb_fixed: return ARM::VST1q32wb_register; case ARM::VST1q64wb_fixed: return ARM::VST1q64wb_register; case ARM::VST1d64TPseudoWB_fixed: return ARM::VST1d64TPseudoWB_register; case ARM::VST1d64QPseudoWB_fixed: return ARM::VST1d64QPseudoWB_register; case ARM::VLD2d8wb_fixed: return ARM::VLD2d8wb_register; case ARM::VLD2d16wb_fixed: return ARM::VLD2d16wb_register; case ARM::VLD2d32wb_fixed: return ARM::VLD2d32wb_register; case ARM::VLD2q8PseudoWB_fixed: return ARM::VLD2q8PseudoWB_register; case ARM::VLD2q16PseudoWB_fixed: return ARM::VLD2q16PseudoWB_register; case ARM::VLD2q32PseudoWB_fixed: return ARM::VLD2q32PseudoWB_register; case ARM::VST2d8wb_fixed: return ARM::VST2d8wb_register; case ARM::VST2d16wb_fixed: return ARM::VST2d16wb_register; case ARM::VST2d32wb_fixed: return ARM::VST2d32wb_register; case ARM::VST2q8PseudoWB_fixed: return ARM::VST2q8PseudoWB_register; case ARM::VST2q16PseudoWB_fixed: return ARM::VST2q16PseudoWB_register; case ARM::VST2q32PseudoWB_fixed: return ARM::VST2q32PseudoWB_register; case ARM::VLD2DUPd8wb_fixed: return ARM::VLD2DUPd8wb_register; case ARM::VLD2DUPd16wb_fixed: return ARM::VLD2DUPd16wb_register; case ARM::VLD2DUPd32wb_fixed: return ARM::VLD2DUPd32wb_register; } return Opc; // If not one we handle, return it unchanged. } /// Returns true if the given increment is a Constant known to be equal to the /// access size performed by a NEON load/store. This means the "[rN]!" form can /// be used. 
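/// For example, loading two 128-bit vectors transfers 32 bytes, so only a
/// constant increment of exactly 32 qualifies.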
static bool isPerfectIncrement(SDValue Inc, EVT VecTy, unsigned NumVecs) { auto C = dyn_cast(Inc); return C && C->getZExtValue() == VecTy.getSizeInBits() / 8 * NumVecs; } void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); SDLoc dl(N); SDValue MemAddr, Align; bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating // nodes are not intrinsics. unsigned AddrOpIdx = IsIntrinsic ? 2 : 1; if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return; SDValue Chain = N->getOperand(0); EVT VT = N->getValueType(0); bool is64BitVector = VT.is64BitVector(); Align = GetVLDSTAlign(Align, dl, NumVecs, is64BitVector); unsigned OpcodeIndex; switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("unhandled vld type"); // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; case MVT::v4f16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; case MVT::v1i64: OpcodeIndex = 3; break; // Quad-register operations: case MVT::v16i8: OpcodeIndex = 0; break; case MVT::v8f16: case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; case MVT::v2f64: case MVT::v2i64: OpcodeIndex = 3; break; } EVT ResTy; if (NumVecs == 1) ResTy = VT; else { unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; if (!is64BitVector) ResTyElts *= 2; ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts); } std::vector ResTys; ResTys.push_back(ResTy); if (isUpdating) ResTys.push_back(MVT::i32); ResTys.push_back(MVT::Other); SDValue Pred = getAL(CurDAG, dl); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); SDNode *VLd; SmallVector Ops; // Double registers and VLD1/VLD2 quad registers are directly supported. if (is64BitVector || NumVecs <= 2) { unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : QOpcodes0[OpcodeIndex]); Ops.push_back(MemAddr); Ops.push_back(Align); if (isUpdating) { SDValue Inc = N->getOperand(AddrOpIdx + 1); bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs); if (!IsImmUpdate) { // We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so // check for the opcode rather than the number of vector elements. if (isVLDfixed(Opc)) Opc = getVLDSTRegisterUpdateOpcode(Opc); Ops.push_back(Inc); // VLD1/VLD2 fixed increment does not need Reg0 so only include it in // the operands if not such an opcode. } else if (!isVLDfixed(Opc)) Ops.push_back(Reg0); } Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); } else { // Otherwise, quad registers are loaded with two separate instructions, // where one loads the even registers and the other loads the odd registers. EVT AddrTy = MemAddr.getValueType(); // Load the even subregs. This is always an updating load, so that it // provides the address to the second load for the odd subregs. SDValue ImplDef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0); const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain }; SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTy, AddrTy, MVT::Other, OpsA); Chain = SDValue(VLdA, 2); // Load the odd subregs. 
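    // For a quad-register VLD3/VLD4 this second instruction picks up the odd
    // D registers, roughly "vld3.8 {d1, d3, d5}, [rN]" following the even
    // halves loaded above.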
Ops.push_back(SDValue(VLdA, 1)); Ops.push_back(Align); if (isUpdating) { SDValue Inc = N->getOperand(AddrOpIdx + 1); assert(isa(Inc.getNode()) && "only constant post-increment update allowed for VLD3/4"); (void)Inc; Ops.push_back(Reg0); } Ops.push_back(SDValue(VLdA, 0)); Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, Ops); } // Transfer memoperands. MachineMemOperand *MemOp = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(VLd), {MemOp}); if (NumVecs == 1) { ReplaceNode(N, VLd); return; } // Extract out the subregisters. SDValue SuperReg = SDValue(VLd, 0); static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 && ARM::qsub_3 == ARM::qsub_0 + 3, "Unexpected subreg numbering"); unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0); for (unsigned Vec = 0; Vec < NumVecs; ++Vec) ReplaceUses(SDValue(N, Vec), CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); if (isUpdating) ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); CurDAG->RemoveDeadNode(N); } void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); SDLoc dl(N); SDValue MemAddr, Align; bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating // nodes are not intrinsics. unsigned AddrOpIdx = IsIntrinsic ? 2 : 1; unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return; MachineMemOperand *MemOp = cast(N)->getMemOperand(); SDValue Chain = N->getOperand(0); EVT VT = N->getOperand(Vec0Idx).getValueType(); bool is64BitVector = VT.is64BitVector(); Align = GetVLDSTAlign(Align, dl, NumVecs, is64BitVector); unsigned OpcodeIndex; switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("unhandled vst type"); // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; case MVT::v4f16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; case MVT::v1i64: OpcodeIndex = 3; break; // Quad-register operations: case MVT::v16i8: OpcodeIndex = 0; break; case MVT::v8f16: case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; case MVT::v2f64: case MVT::v2i64: OpcodeIndex = 3; break; } std::vector ResTys; if (isUpdating) ResTys.push_back(MVT::i32); ResTys.push_back(MVT::Other); SDValue Pred = getAL(CurDAG, dl); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); SmallVector Ops; // Double registers and VST1/VST2 quad registers are directly supported. if (is64BitVector || NumVecs <= 2) { SDValue SrcReg; if (NumVecs == 1) { SrcReg = N->getOperand(Vec0Idx); } else if (is64BitVector) { // Form a REG_SEQUENCE to force register allocation. SDValue V0 = N->getOperand(Vec0Idx + 0); SDValue V1 = N->getOperand(Vec0Idx + 1); if (NumVecs == 2) SrcReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0); else { SDValue V2 = N->getOperand(Vec0Idx + 2); // If it's a vst3, form a quad D-register and leave the last part as // an undef. SDValue V3 = (NumVecs == 3) ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) : N->getOperand(Vec0Idx + 3); SrcReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0); } } else { // Form a QQ register. 
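      // The QQ super-register forces the two source Q registers into
      // consecutive register numbers, as the quad-register store expects.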
SDValue Q0 = N->getOperand(Vec0Idx); SDValue Q1 = N->getOperand(Vec0Idx + 1); SrcReg = SDValue(createQRegPairNode(MVT::v4i64, Q0, Q1), 0); } unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : QOpcodes0[OpcodeIndex]); Ops.push_back(MemAddr); Ops.push_back(Align); if (isUpdating) { SDValue Inc = N->getOperand(AddrOpIdx + 1); bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs); if (!IsImmUpdate) { // We use a VST1 for v1i64 even if the pseudo says VST2/3/4, so // check for the opcode rather than the number of vector elements. if (isVSTfixed(Opc)) Opc = getVLDSTRegisterUpdateOpcode(Opc); Ops.push_back(Inc); } // VST1/VST2 fixed increment does not need Reg0 so only include it in // the operands if not such an opcode. else if (!isVSTfixed(Opc)) Ops.push_back(Reg0); } Ops.push_back(SrcReg); Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); // Transfer memoperands. CurDAG->setNodeMemRefs(cast(VSt), {MemOp}); ReplaceNode(N, VSt); return; } // Otherwise, quad registers are stored with two separate instructions, // where one stores the even registers and the other stores the odd registers. // Form the QQQQ REG_SEQUENCE. SDValue V0 = N->getOperand(Vec0Idx + 0); SDValue V1 = N->getOperand(Vec0Idx + 1); SDValue V2 = N->getOperand(Vec0Idx + 2); SDValue V3 = (NumVecs == 3) ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) : N->getOperand(Vec0Idx + 3); SDValue RegSeq = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0); // Store the even D registers. This is always an updating store, so that it // provides the address to the second store for the odd subregs. const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain }; SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, MemAddr.getValueType(), MVT::Other, OpsA); CurDAG->setNodeMemRefs(cast(VStA), {MemOp}); Chain = SDValue(VStA, 1); // Store the odd D registers. Ops.push_back(SDValue(VStA, 0)); Ops.push_back(Align); if (isUpdating) { SDValue Inc = N->getOperand(AddrOpIdx + 1); assert(isa(Inc.getNode()) && "only constant post-increment update allowed for VST3/4"); (void)Inc; Ops.push_back(Reg0); } Ops.push_back(RegSeq); Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, Ops); CurDAG->setNodeMemRefs(cast(VStB), {MemOp}); ReplaceNode(N, VStB); } void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); SDLoc dl(N); SDValue MemAddr, Align; bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating // nodes are not intrinsics. unsigned AddrOpIdx = IsIntrinsic ? 2 : 1; unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return; MachineMemOperand *MemOp = cast(N)->getMemOperand(); SDValue Chain = N->getOperand(0); unsigned Lane = cast(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); EVT VT = N->getOperand(Vec0Idx).getValueType(); bool is64BitVector = VT.is64BitVector(); unsigned Alignment = 0; if (NumVecs != 3) { Alignment = cast(Align)->getZExtValue(); unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8; if (Alignment > NumBytes) Alignment = NumBytes; if (Alignment < 8 && Alignment < NumBytes) Alignment = 0; // Alignment must be a power of two; make sure of that. 
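    // (Alignment & -Alignment) keeps only the lowest set bit, e.g. 12
    // becomes 4.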
Alignment = (Alignment & -Alignment); if (Alignment == 1) Alignment = 0; } Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32); unsigned OpcodeIndex; switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("unhandled vld/vst lane type"); // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; case MVT::v4f16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; // Quad-register operations: case MVT::v8f16: case MVT::v8i16: OpcodeIndex = 0; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 1; break; } std::vector ResTys; if (IsLoad) { unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; if (!is64BitVector) ResTyElts *= 2; ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts)); } if (isUpdating) ResTys.push_back(MVT::i32); ResTys.push_back(MVT::Other); SDValue Pred = getAL(CurDAG, dl); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); SmallVector Ops; Ops.push_back(MemAddr); Ops.push_back(Align); if (isUpdating) { SDValue Inc = N->getOperand(AddrOpIdx + 1); bool IsImmUpdate = isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs); Ops.push_back(IsImmUpdate ? Reg0 : Inc); } SDValue SuperReg; SDValue V0 = N->getOperand(Vec0Idx + 0); SDValue V1 = N->getOperand(Vec0Idx + 1); if (NumVecs == 2) { if (is64BitVector) SuperReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0); else SuperReg = SDValue(createQRegPairNode(MVT::v4i64, V0, V1), 0); } else { SDValue V2 = N->getOperand(Vec0Idx + 2); SDValue V3 = (NumVecs == 3) ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) : N->getOperand(Vec0Idx + 3); if (is64BitVector) SuperReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0); else SuperReg = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0); } Ops.push_back(SuperReg); Ops.push_back(getI32Imm(Lane, dl)); Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : QOpcodes[OpcodeIndex]); SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); CurDAG->setNodeMemRefs(cast(VLdLn), {MemOp}); if (!IsLoad) { ReplaceNode(N, VLdLn); return; } // Extract the subregisters. SuperReg = SDValue(VLdLn, 0); static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 && ARM::qsub_3 == ARM::qsub_0 + 3, "Unexpected subreg numbering"); unsigned Sub0 = is64BitVector ? 
ARM::dsub_0 : ARM::qsub_0; for (unsigned Vec = 0; Vec < NumVecs; ++Vec) ReplaceUses(SDValue(N, Vec), CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); if (isUpdating) ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); CurDAG->RemoveDeadNode(N); } template void ARMDAGToDAGISel::AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, SDValue PredicateMask) { Ops.push_back(CurDAG->getTargetConstant(ARMVCC::Then, Loc, MVT::i32)); Ops.push_back(PredicateMask); } template void ARMDAGToDAGISel::AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, SDValue PredicateMask, SDValue Inactive) { Ops.push_back(CurDAG->getTargetConstant(ARMVCC::Then, Loc, MVT::i32)); Ops.push_back(PredicateMask); Ops.push_back(Inactive); } template void ARMDAGToDAGISel::AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc) { Ops.push_back(CurDAG->getTargetConstant(ARMVCC::None, Loc, MVT::i32)); Ops.push_back(CurDAG->getRegister(0, MVT::i32)); } template void ARMDAGToDAGISel::AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, EVT InactiveTy) { Ops.push_back(CurDAG->getTargetConstant(ARMVCC::None, Loc, MVT::i32)); Ops.push_back(CurDAG->getRegister(0, MVT::i32)); Ops.push_back(SDValue( CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, InactiveTy), 0)); } void ARMDAGToDAGISel::SelectMVE_WB(SDNode *N, const uint16_t *Opcodes, bool Predicated) { SDLoc Loc(N); SmallVector Ops; uint16_t Opcode; switch (N->getValueType(1).getVectorElementType().getSizeInBits()) { case 32: Opcode = Opcodes[0]; break; case 64: Opcode = Opcodes[1]; break; default: llvm_unreachable("bad vector element size in SelectMVE_WB"); } Ops.push_back(N->getOperand(2)); // vector of base addresses int32_t ImmValue = cast(N->getOperand(3))->getZExtValue(); Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate offset if (Predicated) AddMVEPredicateToOps(Ops, Loc, N->getOperand(4)); else AddEmptyMVEPredicateToOps(Ops, Loc); Ops.push_back(N->getOperand(0)); // chain CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); } void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode, bool Immediate) { SDLoc Loc(N); SmallVector Ops; // Two 32-bit halves of the value to be shifted Ops.push_back(N->getOperand(1)); Ops.push_back(N->getOperand(2)); // The shift count if (Immediate) { int32_t ImmValue = cast(N->getOperand(3))->getZExtValue(); Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate offset } else { Ops.push_back(N->getOperand(3)); } // MVE scalar shifts are IT-predicable, so include the standard // predicate arguments. Ops.push_back(getAL(CurDAG, Loc)); Ops.push_back(CurDAG->getRegister(0, MVT::i32)); CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); } void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry, uint16_t OpcodeWithNoCarry, bool Add, bool Predicated) { SDLoc Loc(N); SmallVector Ops; uint16_t Opcode; unsigned FirstInputOp = Predicated ? 2 : 1; // Two input vectors and the input carry flag Ops.push_back(N->getOperand(FirstInputOp)); Ops.push_back(N->getOperand(FirstInputOp + 1)); SDValue CarryIn = N->getOperand(FirstInputOp + 2); ConstantSDNode *CarryInConstant = dyn_cast(CarryIn); uint32_t CarryMask = 1 << 29; uint32_t CarryExpected = Add ? 
0 : CarryMask; if (CarryInConstant && (CarryInConstant->getZExtValue() & CarryMask) == CarryExpected) { Opcode = OpcodeWithNoCarry; } else { Ops.push_back(CarryIn); Opcode = OpcodeWithCarry; } if (Predicated) AddMVEPredicateToOps(Ops, Loc, N->getOperand(FirstInputOp + 3), // predicate N->getOperand(FirstInputOp - 1)); // inactive else AddEmptyMVEPredicateToOps(Ops, Loc, N->getValueType(0)); CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); } +void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs, + const uint16_t *const *Opcodes) { + EVT VT = N->getValueType(0); + SDLoc Loc(N); + + const uint16_t *OurOpcodes; + switch (VT.getVectorElementType().getSizeInBits()) { + case 8: + OurOpcodes = Opcodes[0]; + break; + case 16: + OurOpcodes = Opcodes[1]; + break; + case 32: + OurOpcodes = Opcodes[2]; + break; + default: + llvm_unreachable("bad vector element size in SelectMVE_VLD"); + } + + EVT DataTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, NumVecs * 2); + EVT ResultTys[] = {DataTy, MVT::Other}; + + auto Data = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, DataTy), 0); + SDValue Chain = N->getOperand(0); + for (unsigned Stage = 0; Stage < NumVecs; ++Stage) { + SDValue Ops[] = {Data, N->getOperand(2), Chain}; + auto LoadInst = + CurDAG->getMachineNode(OurOpcodes[Stage], Loc, ResultTys, Ops); + Data = SDValue(LoadInst, 0); + Chain = SDValue(LoadInst, 1); + } + + for (unsigned i = 0; i < NumVecs; i++) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT, Data)); + ReplaceUses(SDValue(N, NumVecs), Chain); + CurDAG->RemoveDeadNode(N); +} + void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); SDLoc dl(N); SDValue MemAddr, Align; unsigned AddrOpIdx = IsIntrinsic ? 2 : 1; if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return; SDValue Chain = N->getOperand(0); EVT VT = N->getValueType(0); bool is64BitVector = VT.is64BitVector(); unsigned Alignment = 0; if (NumVecs != 3) { Alignment = cast(Align)->getZExtValue(); unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8; if (Alignment > NumBytes) Alignment = NumBytes; if (Alignment < 8 && Alignment < NumBytes) Alignment = 0; // Alignment must be a power of two; make sure of that. Alignment = (Alignment & -Alignment); if (Alignment == 1) Alignment = 0; } Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32); unsigned OpcodeIndex; switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("unhandled vld-dup type"); case MVT::v8i8: case MVT::v16i8: OpcodeIndex = 0; break; case MVT::v4i16: case MVT::v8i16: case MVT::v4f16: case MVT::v8f16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; case MVT::v1f64: case MVT::v1i64: OpcodeIndex = 3; break; } unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; if (!is64BitVector) ResTyElts *= 2; EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts); std::vector ResTys; ResTys.push_back(ResTy); if (isUpdating) ResTys.push_back(MVT::i32); ResTys.push_back(MVT::Other); SDValue Pred = getAL(CurDAG, dl); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); SDNode *VLdDup; if (is64BitVector || NumVecs == 1) { SmallVector Ops; Ops.push_back(MemAddr); Ops.push_back(Align); unsigned Opc = is64BitVector ? 
DOpcodes[OpcodeIndex] : QOpcodes0[OpcodeIndex]; if (isUpdating) { // fixed-stride update instructions don't have an explicit writeback // operand. It's implicit in the opcode itself. SDValue Inc = N->getOperand(2); bool IsImmUpdate = isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs); if (NumVecs <= 2 && !IsImmUpdate) Opc = getVLDSTRegisterUpdateOpcode(Opc); if (!IsImmUpdate) Ops.push_back(Inc); // FIXME: VLD3 and VLD4 haven't been updated to that form yet. else if (NumVecs > 2) Ops.push_back(Reg0); } Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); } else if (NumVecs == 2) { const SDValue OpsA[] = { MemAddr, Align, Pred, Reg0, Chain }; SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTys, OpsA); Chain = SDValue(VLdA, 1); const SDValue OpsB[] = { MemAddr, Align, Pred, Reg0, Chain }; VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB); } else { SDValue ImplDef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0); const SDValue OpsA[] = { MemAddr, Align, ImplDef, Pred, Reg0, Chain }; SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTys, OpsA); SDValue SuperReg = SDValue(VLdA, 0); Chain = SDValue(VLdA, 1); const SDValue OpsB[] = { MemAddr, Align, SuperReg, Pred, Reg0, Chain }; VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB); } // Transfer memoperands. MachineMemOperand *MemOp = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(VLdDup), {MemOp}); // Extract the subregisters. if (NumVecs == 1) { ReplaceUses(SDValue(N, 0), SDValue(VLdDup, 0)); } else { SDValue SuperReg = SDValue(VLdDup, 0); static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering"); unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; for (unsigned Vec = 0; Vec != NumVecs; ++Vec) { ReplaceUses(SDValue(N, Vec), CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg)); } } ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); if (isUpdating) ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); CurDAG->RemoveDeadNode(N); } bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) { if (!Subtarget->hasV6T2Ops()) return false; unsigned Opc = isSigned ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX) : (Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX); SDLoc dl(N); // For unsigned extracts, check for a shift right and mask unsigned And_imm = 0; if (N->getOpcode() == ISD::AND) { if (isOpcWithIntImmediate(N, ISD::AND, And_imm)) { // The immediate is a mask of the low bits iff imm & (imm+1) == 0 if (And_imm & (And_imm + 1)) return false; unsigned Srl_imm = 0; if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); // Mask off the unnecessary bits of the AND immediate; normally // DAGCombine will do this, but that might not happen if // targetShrinkDemandedConstant chooses a different immediate. And_imm &= -1U >> Srl_imm; // Note: The width operand is encoded as width-1. unsigned Width = countTrailingOnes(And_imm) - 1; unsigned LSB = Srl_imm; SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); if ((LSB + Width + 1) == N->getValueType(0).getSizeInBits()) { // It's cheaper to use a right shift to extract the top bits. if (Subtarget->isThumb()) { Opc = isSigned ? 
ARM::t2ASRri : ARM::t2LSRri; SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(LSB, dl, MVT::i32), getAL(CurDAG, dl), Reg0, Reg0 }; CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); return true; } // ARM models shift instructions as MOVsi with shifter operand. ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(ISD::SRL); SDValue ShOpc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, LSB), dl, MVT::i32); SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc, getAL(CurDAG, dl), Reg0, Reg0 }; CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops); return true; } assert(LSB + Width + 1 <= 32 && "Shouldn't create an invalid ubfx"); SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(LSB, dl, MVT::i32), CurDAG->getTargetConstant(Width, dl, MVT::i32), getAL(CurDAG, dl), Reg0 }; CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); return true; } } return false; } // Otherwise, we're looking for a shift of a shift unsigned Shl_imm = 0; if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!"); unsigned Srl_imm = 0; if (isInt32Immediate(N->getOperand(1), Srl_imm)) { assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); // Note: The width operand is encoded as width-1. unsigned Width = 32 - Srl_imm - 1; int LSB = Srl_imm - Shl_imm; if (LSB < 0) return false; SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); assert(LSB + Width + 1 <= 32 && "Shouldn't create an invalid ubfx"); SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(LSB, dl, MVT::i32), CurDAG->getTargetConstant(Width, dl, MVT::i32), getAL(CurDAG, dl), Reg0 }; CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); return true; } } // Or we are looking for a shift of an and, with a mask operand if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_imm) && isShiftedMask_32(And_imm)) { unsigned Srl_imm = 0; unsigned LSB = countTrailingZeros(And_imm); // Shift must be the same as the ands lsb if (isInt32Immediate(N->getOperand(1), Srl_imm) && Srl_imm == LSB) { assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); unsigned MSB = 31 - countLeadingZeros(And_imm); // Note: The width operand is encoded as width-1. unsigned Width = MSB - LSB; SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); assert(Srl_imm + Width + 1 <= 32 && "Shouldn't create an invalid ubfx"); SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(Srl_imm, dl, MVT::i32), CurDAG->getTargetConstant(Width, dl, MVT::i32), getAL(CurDAG, dl), Reg0 }; CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); return true; } } if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) { unsigned Width = cast(N->getOperand(1))->getVT().getSizeInBits(); unsigned LSB = 0; if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, LSB) && !isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRA, LSB)) return false; if (LSB + Width > 32) return false; SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); assert(LSB + Width <= 32 && "Shouldn't create an invalid ubfx"); SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(LSB, dl, MVT::i32), CurDAG->getTargetConstant(Width - 1, dl, MVT::i32), getAL(CurDAG, dl), Reg0 }; CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); return true; } return false; } /// Target-specific DAG combining for ISD::XOR. 
/// Target-independent combining lowers SELECT_CC nodes of the form /// select_cc setg[ge] X, 0, X, -X /// select_cc setgt X, -1, X, -X /// select_cc setl[te] X, 0, -X, X /// select_cc setlt X, 1, -X, X /// which represent Integer ABS into: /// Y = sra (X, size(X)-1); xor (add (X, Y), Y) /// ARM instruction selection detects the latter and matches it to /// ARM::ABS or ARM::t2ABS machine node. bool ARMDAGToDAGISel::tryABSOp(SDNode *N){ SDValue XORSrc0 = N->getOperand(0); SDValue XORSrc1 = N->getOperand(1); EVT VT = N->getValueType(0); if (Subtarget->isThumb1Only()) return false; if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA) return false; SDValue ADDSrc0 = XORSrc0.getOperand(0); SDValue ADDSrc1 = XORSrc0.getOperand(1); SDValue SRASrc0 = XORSrc1.getOperand(0); SDValue SRASrc1 = XORSrc1.getOperand(1); ConstantSDNode *SRAConstant = dyn_cast(SRASrc1); EVT XType = SRASrc0.getValueType(); unsigned Size = XType.getSizeInBits() - 1; if (ADDSrc1 == XORSrc1 && ADDSrc0 == SRASrc0 && XType.isInteger() && SRAConstant != nullptr && Size == SRAConstant->getZExtValue()) { unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS; CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0); return true; } return false; } /// We've got special pseudo-instructions for these void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) { unsigned Opcode; EVT MemTy = cast(N)->getMemoryVT(); if (MemTy == MVT::i8) Opcode = ARM::CMP_SWAP_8; else if (MemTy == MVT::i16) Opcode = ARM::CMP_SWAP_16; else if (MemTy == MVT::i32) Opcode = ARM::CMP_SWAP_32; else llvm_unreachable("Unknown AtomicCmpSwap type"); SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3), N->getOperand(0)}; SDNode *CmpSwap = CurDAG->getMachineNode( Opcode, SDLoc(N), CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other), Ops); MachineMemOperand *MemOp = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(CmpSwap), {MemOp}); ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0)); ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2)); CurDAG->RemoveDeadNode(N); } static Optional> getContiguousRangeOfSetBits(const APInt &A) { unsigned FirstOne = A.getBitWidth() - A.countLeadingZeros() - 1; unsigned LastOne = A.countTrailingZeros(); if (A.countPopulation() != (FirstOne - LastOne + 1)) return Optional>(); return std::make_pair(FirstOne, LastOne); } void ARMDAGToDAGISel::SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI) { assert(N->getOpcode() == ARMISD::CMPZ); SwitchEQNEToPLMI = false; if (!Subtarget->isThumb()) // FIXME: Work out whether it is profitable to do this in A32 mode - LSL and // LSR don't exist as standalone instructions - they need the barrel shifter. return; // select (cmpz (and X, C), #0) -> (LSLS X) or (LSRS X) or (LSRS (LSLS X)) SDValue And = N->getOperand(0); if (!And->hasOneUse()) return; SDValue Zero = N->getOperand(1); if (!isa(Zero) || !cast(Zero)->isNullValue() || And->getOpcode() != ISD::AND) return; SDValue X = And.getOperand(0); auto C = dyn_cast(And.getOperand(1)); if (!C) return; auto Range = getContiguousRangeOfSetBits(C->getAPIntValue()); if (!Range) return; // There are several ways to lower this: SDNode *NewN; SDLoc dl(N); auto EmitShift = [&](unsigned Opc, SDValue Src, unsigned Imm) -> SDNode* { if (Subtarget->isThumb2()) { Opc = (Opc == ARM::tLSLri) ? 
ARM::t2LSLri : ARM::t2LSRri; SDValue Ops[] = { Src, CurDAG->getTargetConstant(Imm, dl, MVT::i32), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; return CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops); } else { SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), Src, CurDAG->getTargetConstant(Imm, dl, MVT::i32), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)}; return CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops); } }; if (Range->second == 0) { // 1. Mask includes the LSB -> Simply shift the top N bits off NewN = EmitShift(ARM::tLSLri, X, 31 - Range->first); ReplaceNode(And.getNode(), NewN); } else if (Range->first == 31) { // 2. Mask includes the MSB -> Simply shift the bottom N bits off NewN = EmitShift(ARM::tLSRri, X, Range->second); ReplaceNode(And.getNode(), NewN); } else if (Range->first == Range->second) { // 3. Only one bit is set. We can shift this into the sign bit and use a // PL/MI comparison. NewN = EmitShift(ARM::tLSLri, X, 31 - Range->first); ReplaceNode(And.getNode(), NewN); SwitchEQNEToPLMI = true; } else if (!Subtarget->hasV6T2Ops()) { // 4. Do a double shift to clear bottom and top bits, but only in // thumb-1 mode as in thumb-2 we can use UBFX. NewN = EmitShift(ARM::tLSLri, X, 31 - Range->first); NewN = EmitShift(ARM::tLSRri, SDValue(NewN, 0), Range->second + (31 - Range->first)); ReplaceNode(And.getNode(), NewN); } } void ARMDAGToDAGISel::Select(SDNode *N) { SDLoc dl(N); if (N->isMachineOpcode()) { N->setNodeId(-1); return; // Already selected. } switch (N->getOpcode()) { default: break; case ISD::STORE: { // For Thumb1, match an sp-relative store in C++. This is a little // unfortunate, but I don't think I can make the chain check work // otherwise. (The chain of the store has to be the same as the chain // of the CopyFromReg, or else we can't replace the CopyFromReg with // a direct reference to "SP".) // // This is only necessary on Thumb1 because Thumb1 sp-relative stores use // a different addressing mode from other four-byte stores. // // This pattern usually comes up with call arguments. StoreSDNode *ST = cast(N); SDValue Ptr = ST->getBasePtr(); if (Subtarget->isThumb1Only() && ST->isUnindexed()) { int RHSC = 0; if (Ptr.getOpcode() == ISD::ADD && isScaledConstantInRange(Ptr.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) Ptr = Ptr.getOperand(0); if (Ptr.getOpcode() == ISD::CopyFromReg && cast(Ptr.getOperand(1))->getReg() == ARM::SP && Ptr.getOperand(0) == ST->getChain()) { SDValue Ops[] = {ST->getValue(), CurDAG->getRegister(ARM::SP, MVT::i32), CurDAG->getTargetConstant(RHSC, dl, MVT::i32), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), ST->getChain()}; MachineSDNode *ResNode = CurDAG->getMachineNode(ARM::tSTRspi, dl, MVT::Other, Ops); MachineMemOperand *MemOp = ST->getMemOperand(); CurDAG->setNodeMemRefs(cast(ResNode), {MemOp}); ReplaceNode(N, ResNode); return; } } break; } case ISD::WRITE_REGISTER: if (tryWriteRegister(N)) return; break; case ISD::READ_REGISTER: if (tryReadRegister(N)) return; break; case ISD::INLINEASM: case ISD::INLINEASM_BR: if (tryInlineAsm(N)) return; break; case ISD::XOR: // Select special operations if XOR node forms integer ABS pattern if (tryABSOp(N)) return; // Other cases are autogenerated. 
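    // The shape tryABSOp recognises, written out as a standalone sketch
    // (hypothetical helper, for illustration only; not part of this file):
    //
    //   static int32_t AbsViaShiftXor(int32_t X) {
    //     int32_t Y = X >> 31;   // Y = sra(X, size(X)-1): 0 or all-ones
    //     return (X + Y) ^ Y;    // xor(add(X, Y), Y)
    //   }
    //
    // AbsViaShiftXor(-5) == 5 and AbsViaShiftXor(7) == 7, matching the
    // select_cc forms documented above tryABSOp.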
break; case ISD::Constant: { unsigned Val = cast(N)->getZExtValue(); // If we can't materialize the constant we need to use a literal pool if (ConstantMaterializationCost(Val, Subtarget) > 2) { SDValue CPIdx = CurDAG->getTargetConstantPool( ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val), TLI->getPointerTy(CurDAG->getDataLayout())); SDNode *ResNode; if (Subtarget->isThumb()) { SDValue Ops[] = { CPIdx, getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getEntryNode() }; ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other, Ops); } else { SDValue Ops[] = { CPIdx, CurDAG->getTargetConstant(0, dl, MVT::i32), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getEntryNode() }; ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, Ops); } // Annotate the Node with memory operand information so that MachineInstr // queries work properly. This e.g. gives the register allocation the // required information for rematerialization. MachineFunction& MF = CurDAG->getMachineFunction(); MachineMemOperand *MemOp = MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad, 4, 4); CurDAG->setNodeMemRefs(cast(ResNode), {MemOp}); ReplaceNode(N, ResNode); return; } // Other cases are autogenerated. break; } case ISD::FrameIndex: { // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm. int FI = cast(N)->getIndex(); SDValue TFI = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); if (Subtarget->isThumb1Only()) { // Set the alignment of the frame object to 4, to avoid having to generate // more than one ADD MachineFrameInfo &MFI = MF->getFrameInfo(); if (MFI.getObjectAlignment(FI) < 4) MFI.setObjectAlignment(FI, 4); CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI, CurDAG->getTargetConstant(0, dl, MVT::i32)); return; } else { unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ? ARM::t2ADDri : ARM::ADDri); SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, dl, MVT::i32), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); return; } } case ISD::SRL: if (tryV6T2BitfieldExtractOp(N, false)) return; break; case ISD::SIGN_EXTEND_INREG: case ISD::SRA: if (tryV6T2BitfieldExtractOp(N, true)) return; break; case ISD::MUL: if (Subtarget->isThumb1Only()) break; if (ConstantSDNode *C = dyn_cast(N->getOperand(1))) { unsigned RHSV = C->getZExtValue(); if (!RHSV) break; if (isPowerOf2_32(RHSV-1)) { // 2^n+1? unsigned ShImm = Log2_32(RHSV-1); if (ShImm >= 32) break; SDValue V = N->getOperand(0); ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm); SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, dl, MVT::i32); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); if (Subtarget->isThumb()) { SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 }; CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops); return; } else { SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 }; CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops); return; } } if (isPowerOf2_32(RHSV+1)) { // 2^n-1? 
unsigned ShImm = Log2_32(RHSV+1); if (ShImm >= 32) break; SDValue V = N->getOperand(0); ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm); SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, dl, MVT::i32); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); if (Subtarget->isThumb()) { SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 }; CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops); return; } else { SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 }; CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops); return; } } } break; case ISD::AND: { // Check for unsigned bitfield extract if (tryV6T2BitfieldExtractOp(N, false)) return; // If an immediate is used in an AND node, it is possible that the immediate // can be more optimally materialized when negated. If this is the case we // can negate the immediate and use a BIC instead. auto *N1C = dyn_cast(N->getOperand(1)); if (N1C && N1C->hasOneUse() && Subtarget->isThumb()) { uint32_t Imm = (uint32_t) N1C->getZExtValue(); // In Thumb2 mode, an AND can take a 12-bit immediate. If this // immediate can be negated and fit in the immediate operand of // a t2BIC, don't do any manual transform here as this can be // handled by the generic ISel machinery. bool PreferImmediateEncoding = Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm)); if (!PreferImmediateEncoding && ConstantMaterializationCost(Imm, Subtarget) > ConstantMaterializationCost(~Imm, Subtarget)) { // The current immediate costs more to materialize than a negated // immediate, so negate the immediate and use a BIC. SDValue NewImm = CurDAG->getConstant(~N1C->getZExtValue(), dl, MVT::i32); // If the new constant didn't exist before, reposition it in the topological // ordering so it is just before N. Otherwise, don't touch its location. if (NewImm->getNodeId() == -1) CurDAG->RepositionNode(N->getIterator(), NewImm.getNode()); if (!Subtarget->hasThumb2()) { SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), N->getOperand(0), NewImm, getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)}; ReplaceNode(N, CurDAG->getMachineNode(ARM::tBIC, dl, MVT::i32, Ops)); return; } else { SDValue Ops[] = {N->getOperand(0), NewImm, getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32)}; ReplaceNode(N, CurDAG->getMachineNode(ARM::t2BICrr, dl, MVT::i32, Ops)); return; } } } // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits // of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits // are entirely contributed by c2 and lower 16-bits are entirely contributed // by x. That's equal to (or (and x, 0xffff), (and c1, 0xffff0000)). // Select it to: "movt x, ((c1 & 0xffff) >> 16) EVT VT = N->getValueType(0); if (VT != MVT::i32) break; unsigned Opc = (Subtarget->isThumb() && Subtarget->hasThumb2()) ? ARM::t2MOVTi16 : (Subtarget->hasV6T2Ops() ? 
ARM::MOVTi16 : 0); if (!Opc) break; SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); N1C = dyn_cast(N1); if (!N1C) break; if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) { SDValue N2 = N0.getOperand(1); ConstantSDNode *N2C = dyn_cast(N2); if (!N2C) break; unsigned N1CVal = N1C->getZExtValue(); unsigned N2CVal = N2C->getZExtValue(); if ((N1CVal & 0xffff0000U) == (N2CVal & 0xffff0000U) && (N1CVal & 0xffffU) == 0xffffU && (N2CVal & 0xffffU) == 0x0U) { SDValue Imm16 = CurDAG->getTargetConstant((N2CVal & 0xFFFF0000U) >> 16, dl, MVT::i32); SDValue Ops[] = { N0.getOperand(0), Imm16, getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) }; ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops)); return; } } break; } case ARMISD::UMAAL: { unsigned Opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL; SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) }; ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, MVT::i32, MVT::i32, Ops)); return; } case ARMISD::UMLAL:{ if (Subtarget->isThumb()) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)}; ReplaceNode( N, CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops)); return; }else{ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; ReplaceNode(N, CurDAG->getMachineNode( Subtarget->hasV6Ops() ? ARM::UMLAL : ARM::UMLALv5, dl, MVT::i32, MVT::i32, Ops)); return; } } case ARMISD::SMLAL:{ if (Subtarget->isThumb()) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)}; ReplaceNode( N, CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops)); return; }else{ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; ReplaceNode(N, CurDAG->getMachineNode( Subtarget->hasV6Ops() ? ARM::SMLAL : ARM::SMLALv5, dl, MVT::i32, MVT::i32, Ops)); return; } } case ARMISD::SUBE: { if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) break; // Look for a pattern to match SMMLS // (sube a, (smul_loHi a, b), (subc 0, (smul_LOhi(a, b)))) if (N->getOperand(1).getOpcode() != ISD::SMUL_LOHI || N->getOperand(2).getOpcode() != ARMISD::SUBC || !SDValue(N, 1).use_empty()) break; if (Subtarget->isThumb()) assert(Subtarget->hasThumb2() && "This pattern should not be generated for Thumb"); SDValue SmulLoHi = N->getOperand(1); SDValue Subc = N->getOperand(2); auto *Zero = dyn_cast(Subc.getOperand(0)); if (!Zero || Zero->getZExtValue() != 0 || Subc.getOperand(1) != SmulLoHi.getValue(0) || N->getOperand(1) != SmulLoHi.getValue(1) || N->getOperand(2) != Subc.getValue(1)) break; unsigned Opc = Subtarget->isThumb2() ? 
ARM::t2SMMLS : ARM::SMMLS; SDValue Ops[] = { SmulLoHi.getOperand(0), SmulLoHi.getOperand(1), N->getOperand(0), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) }; ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops)); return; } case ISD::LOAD: { if (Subtarget->hasMVEIntegerOps() && tryMVEIndexedLoad(N)) return; if (Subtarget->isThumb() && Subtarget->hasThumb2()) { if (tryT2IndexedLoad(N)) return; } else if (Subtarget->isThumb()) { if (tryT1IndexedLoad(N)) return; } else if (tryARMIndexedLoad(N)) return; // Other cases are autogenerated. break; } case ARMISD::WLS: case ARMISD::LE: { SDValue Ops[] = { N->getOperand(1), N->getOperand(2), N->getOperand(0) }; unsigned Opc = N->getOpcode() == ARMISD::WLS ? ARM::t2WhileLoopStart : ARM::t2LoopEnd; SDNode *New = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); ReplaceUses(N, New); CurDAG->RemoveDeadNode(N); return; } case ARMISD::LOOP_DEC: { SDValue Ops[] = { N->getOperand(1), N->getOperand(2), N->getOperand(0) }; SDNode *Dec = CurDAG->getMachineNode(ARM::t2LoopDec, dl, CurDAG->getVTList(MVT::i32, MVT::Other), Ops); ReplaceUses(N, Dec); CurDAG->RemoveDeadNode(N); return; } case ARMISD::BRCOND: { // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc) // Pattern complexity = 6 cost = 1 size = 0 // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc) // Pattern complexity = 6 cost = 1 size = 0 // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) // Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc) // Pattern complexity = 6 cost = 1 size = 0 unsigned Opc = Subtarget->isThumb() ? ((Subtarget->hasThumb2()) ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc; SDValue Chain = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); SDValue N3 = N->getOperand(3); SDValue InFlag = N->getOperand(4); assert(N1.getOpcode() == ISD::BasicBlock); assert(N2.getOpcode() == ISD::Constant); assert(N3.getOpcode() == ISD::Register); unsigned CC = (unsigned) cast(N2)->getZExtValue(); if (InFlag.getOpcode() == ARMISD::CMPZ) { if (InFlag.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) { SDValue Int = InFlag.getOperand(0); uint64_t ID = cast(Int->getOperand(1))->getZExtValue(); // Handle low-overhead loops. 
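        // Sketch of the DAG shape being folded here (node names abbreviated,
        // illustrative only):
        //
        //   brcond (cmpz (intrinsic_w_chain loop_decrement_reg, Elts, Size), 0), BB
        //     ==> t2LoopDec Elts, Size  ;  t2LoopEnd <dec result>, BB
        //
        // The low-overhead-loop passes later turn this pair into an actual LE
        // instruction (or revert it to a sub/cmp/branch sequence). The check
        // below recognises exactly that intrinsic: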
if (ID == Intrinsic::loop_decrement_reg) { SDValue Elements = Int.getOperand(2); SDValue Size = CurDAG->getTargetConstant( cast(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); SDValue Args[] = { Elements, Size, Int.getOperand(0) }; SDNode *LoopDec = CurDAG->getMachineNode(ARM::t2LoopDec, dl, CurDAG->getVTList(MVT::i32, MVT::Other), Args); ReplaceUses(Int.getNode(), LoopDec); SDValue EndArgs[] = { SDValue(LoopDec, 0), N1, Chain }; SDNode *LoopEnd = CurDAG->getMachineNode(ARM::t2LoopEnd, dl, MVT::Other, EndArgs); ReplaceUses(N, LoopEnd); CurDAG->RemoveDeadNode(N); CurDAG->RemoveDeadNode(InFlag.getNode()); CurDAG->RemoveDeadNode(Int.getNode()); return; } } bool SwitchEQNEToPLMI; SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI); InFlag = N->getOperand(4); if (SwitchEQNEToPLMI) { switch ((ARMCC::CondCodes)CC) { default: llvm_unreachable("CMPZ must be either NE or EQ!"); case ARMCC::NE: CC = (unsigned)ARMCC::MI; break; case ARMCC::EQ: CC = (unsigned)ARMCC::PL; break; } } } SDValue Tmp2 = CurDAG->getTargetConstant(CC, dl, MVT::i32); SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag }; SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, MVT::Glue, Ops); Chain = SDValue(ResNode, 0); if (N->getNumValues() == 2) { InFlag = SDValue(ResNode, 1); ReplaceUses(SDValue(N, 1), InFlag); } ReplaceUses(SDValue(N, 0), SDValue(Chain.getNode(), Chain.getResNo())); CurDAG->RemoveDeadNode(N); return; } case ARMISD::CMPZ: { // select (CMPZ X, #-C) -> (CMPZ (ADDS X, #C), #0) // This allows us to avoid materializing the expensive negative constant. // The CMPZ #0 is useless and will be peepholed away but we need to keep it // for its glue output. SDValue X = N->getOperand(0); auto *C = dyn_cast(N->getOperand(1).getNode()); if (C && C->getSExtValue() < 0 && Subtarget->isThumb()) { int64_t Addend = -C->getSExtValue(); SDNode *Add = nullptr; // ADDS can be better than CMN if the immediate fits in a // 16-bit ADDS, which means either [0,256) for tADDi8 or [0,8) for tADDi3. // Outside that range we can just use a CMN which is 32-bit but has a // 12-bit immediate range. if (Addend < 1<<8) { if (Subtarget->isThumb2()) { SDValue Ops[] = { X, CurDAG->getTargetConstant(Addend, dl, MVT::i32), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; Add = CurDAG->getMachineNode(ARM::t2ADDri, dl, MVT::i32, Ops); } else { unsigned Opc = (Addend < 1<<3) ? ARM::tADDi3 : ARM::tADDi8; SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X, CurDAG->getTargetConstant(Addend, dl, MVT::i32), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)}; Add = CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops); } } if (Add) { SDValue Ops2[] = {SDValue(Add, 0), CurDAG->getConstant(0, dl, MVT::i32)}; CurDAG->MorphNodeTo(N, ARMISD::CMPZ, CurDAG->getVTList(MVT::Glue), Ops2); } } // Other cases are autogenerated. 
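    // Worked example (illustrative constant and registers): comparing X
    // against -42 would normally force the negative constant to be
    // materialized, but since (X == -C) is equivalent to ((X + C) == 0) in
    // wrapping 32-bit arithmetic, the rewrite above emits
    //   adds r1, r0, #42   ; 16-bit encoding when C fits tADDi3/tADDi8
    // and compares the sum against zero instead; the trailing CMPZ #0 exists
    // only to carry the glue result and is peepholed away later.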
break; } case ARMISD::CMOV: { SDValue InFlag = N->getOperand(4); if (InFlag.getOpcode() == ARMISD::CMPZ) { bool SwitchEQNEToPLMI; SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI); if (SwitchEQNEToPLMI) { SDValue ARMcc = N->getOperand(2); ARMCC::CondCodes CC = (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); switch (CC) { default: llvm_unreachable("CMPZ must be either NE or EQ!"); case ARMCC::NE: CC = ARMCC::MI; break; case ARMCC::EQ: CC = ARMCC::PL; break; } SDValue NewARMcc = CurDAG->getConstant((unsigned)CC, dl, MVT::i32); SDValue Ops[] = {N->getOperand(0), N->getOperand(1), NewARMcc, N->getOperand(3), N->getOperand(4)}; CurDAG->MorphNodeTo(N, ARMISD::CMOV, N->getVTList(), Ops); } } // Other cases are autogenerated. break; } case ARMISD::VZIP: { unsigned Opc = 0; EVT VT = N->getValueType(0); switch (VT.getSimpleVT().SimpleTy) { default: return; case MVT::v8i8: Opc = ARM::VZIPd8; break; case MVT::v4f16: case MVT::v4i16: Opc = ARM::VZIPd16; break; case MVT::v2f32: // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. case MVT::v2i32: Opc = ARM::VTRNd32; break; case MVT::v16i8: Opc = ARM::VZIPq8; break; case MVT::v8f16: case MVT::v8i16: Opc = ARM::VZIPq16; break; case MVT::v4f32: case MVT::v4i32: Opc = ARM::VZIPq32; break; } SDValue Pred = getAL(CurDAG, dl); SDValue PredReg = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops)); return; } case ARMISD::VUZP: { unsigned Opc = 0; EVT VT = N->getValueType(0); switch (VT.getSimpleVT().SimpleTy) { default: return; case MVT::v8i8: Opc = ARM::VUZPd8; break; case MVT::v4f16: case MVT::v4i16: Opc = ARM::VUZPd16; break; case MVT::v2f32: // vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. 
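      // With only two 32-bit lanes per D register, zip, unzip and transpose
      // are the same permutation, which is why the v2 cases all map to
      // VTRNd32. Illustrative lane trace for A = {a0, a1}, B = {b0, b1}:
      //   vzip.32 -> {a0, b0}, {a1, b1}
      //   vuzp.32 -> {a0, b0}, {a1, b1}
      //   vtrn.32 -> {a0, b0}, {a1, b1}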
case MVT::v2i32: Opc = ARM::VTRNd32; break; case MVT::v16i8: Opc = ARM::VUZPq8; break; case MVT::v8f16: case MVT::v8i16: Opc = ARM::VUZPq16; break; case MVT::v4f32: case MVT::v4i32: Opc = ARM::VUZPq32; break; } SDValue Pred = getAL(CurDAG, dl); SDValue PredReg = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops)); return; } case ARMISD::VTRN: { unsigned Opc = 0; EVT VT = N->getValueType(0); switch (VT.getSimpleVT().SimpleTy) { default: return; case MVT::v8i8: Opc = ARM::VTRNd8; break; case MVT::v4f16: case MVT::v4i16: Opc = ARM::VTRNd16; break; case MVT::v2f32: case MVT::v2i32: Opc = ARM::VTRNd32; break; case MVT::v16i8: Opc = ARM::VTRNq8; break; case MVT::v8f16: case MVT::v8i16: Opc = ARM::VTRNq16; break; case MVT::v4f32: case MVT::v4i32: Opc = ARM::VTRNq32; break; } SDValue Pred = getAL(CurDAG, dl); SDValue PredReg = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops)); return; } case ARMISD::BUILD_VECTOR: { EVT VecVT = N->getValueType(0); EVT EltVT = VecVT.getVectorElementType(); unsigned NumElts = VecVT.getVectorNumElements(); if (EltVT == MVT::f64) { assert(NumElts == 2 && "unexpected type for BUILD_VECTOR"); ReplaceNode( N, createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1))); return; } assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR"); if (NumElts == 2) { ReplaceNode( N, createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1))); return; } assert(NumElts == 4 && "unexpected type for BUILD_VECTOR"); ReplaceNode(N, createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3))); return; } case ARMISD::VLD1DUP: { static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8, ARM::VLD1DUPd16, ARM::VLD1DUPd32 }; static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16, ARM::VLD1DUPq32 }; SelectVLDDup(N, /* IsIntrinsic= */ false, false, 1, DOpcodes, QOpcodes); return; } case ARMISD::VLD2DUP: { static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16, ARM::VLD2DUPd32 }; SelectVLDDup(N, /* IsIntrinsic= */ false, false, 2, Opcodes); return; } case ARMISD::VLD3DUP: { static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd32Pseudo }; SelectVLDDup(N, /* IsIntrinsic= */ false, false, 3, Opcodes); return; } case ARMISD::VLD4DUP: { static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo, ARM::VLD4DUPd32Pseudo }; SelectVLDDup(N, /* IsIntrinsic= */ false, false, 4, Opcodes); return; } case ARMISD::VLD1DUP_UPD: { static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8wb_fixed, ARM::VLD1DUPd16wb_fixed, ARM::VLD1DUPd32wb_fixed }; static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed, ARM::VLD1DUPq16wb_fixed, ARM::VLD1DUPq32wb_fixed }; SelectVLDDup(N, /* IsIntrinsic= */ false, true, 1, DOpcodes, QOpcodes); return; } case ARMISD::VLD2DUP_UPD: { static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed, ARM::VLD2DUPd16wb_fixed, ARM::VLD2DUPd32wb_fixed }; SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, Opcodes); return; } case ARMISD::VLD3DUP_UPD: { static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd32Pseudo_UPD }; SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, Opcodes); return; } case ARMISD::VLD4DUP_UPD: { static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, 
ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd32Pseudo_UPD }; SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, Opcodes); return; } case ARMISD::VLD1_UPD: { static const uint16_t DOpcodes[] = { ARM::VLD1d8wb_fixed, ARM::VLD1d16wb_fixed, ARM::VLD1d32wb_fixed, ARM::VLD1d64wb_fixed }; static const uint16_t QOpcodes[] = { ARM::VLD1q8wb_fixed, ARM::VLD1q16wb_fixed, ARM::VLD1q32wb_fixed, ARM::VLD1q64wb_fixed }; SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr); return; } case ARMISD::VLD2_UPD: { static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed, ARM::VLD2d16wb_fixed, ARM::VLD2d32wb_fixed, ARM::VLD1q64wb_fixed}; static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed, ARM::VLD2q16PseudoWB_fixed, ARM::VLD2q32PseudoWB_fixed }; SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr); return; } case ARMISD::VLD3_UPD: { static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d16Pseudo_UPD, ARM::VLD3d32Pseudo_UPD, ARM::VLD1d64TPseudoWB_fixed}; static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, ARM::VLD3q16Pseudo_UPD, ARM::VLD3q32Pseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q32oddPseudo_UPD }; SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); return; } case ARMISD::VLD4_UPD: { static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, ARM::VLD4d32Pseudo_UPD, ARM::VLD1d64QPseudoWB_fixed}; static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, ARM::VLD4q16Pseudo_UPD, ARM::VLD4q32Pseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q32oddPseudo_UPD }; SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); return; } case ARMISD::VLD2LN_UPD: { static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq32Pseudo_UPD }; SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes); return; } case ARMISD::VLD3LN_UPD: { static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq32Pseudo_UPD }; SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes); return; } case ARMISD::VLD4LN_UPD: { static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq32Pseudo_UPD }; SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes); return; } case ARMISD::VST1_UPD: { static const uint16_t DOpcodes[] = { ARM::VST1d8wb_fixed, ARM::VST1d16wb_fixed, ARM::VST1d32wb_fixed, ARM::VST1d64wb_fixed }; static const uint16_t QOpcodes[] = { ARM::VST1q8wb_fixed, ARM::VST1q16wb_fixed, ARM::VST1q32wb_fixed, ARM::VST1q64wb_fixed }; SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr); return; } case ARMISD::VST2_UPD: { static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed, ARM::VST2d16wb_fixed, ARM::VST2d32wb_fixed, ARM::VST1q64wb_fixed}; static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed, ARM::VST2q16PseudoWB_fixed, ARM::VST2q32PseudoWB_fixed }; SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr); return; } case ARMISD::VST3_UPD: { static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo_UPD, ARM::VST3d16Pseudo_UPD, ARM::VST3d32Pseudo_UPD, ARM::VST1d64TPseudoWB_fixed}; static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, ARM::VST3q16Pseudo_UPD, 
ARM::VST3q32Pseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, ARM::VST3q16oddPseudo_UPD, ARM::VST3q32oddPseudo_UPD }; SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); return; } case ARMISD::VST4_UPD: { static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, ARM::VST4d32Pseudo_UPD, ARM::VST1d64QPseudoWB_fixed}; static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, ARM::VST4q16Pseudo_UPD, ARM::VST4q32Pseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, ARM::VST4q16oddPseudo_UPD, ARM::VST4q32oddPseudo_UPD }; SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); return; } case ARMISD::VST2LN_UPD: { static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq32Pseudo_UPD }; SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes); return; } case ARMISD::VST3LN_UPD: { static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq32Pseudo_UPD }; SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes); return; } case ARMISD::VST4LN_UPD: { static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq32Pseudo_UPD }; SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes); return; } case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default: break; case Intrinsic::arm_mrrc: case Intrinsic::arm_mrrc2: { SDLoc dl(N); SDValue Chain = N->getOperand(0); unsigned Opc; if (Subtarget->isThumb()) Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::t2MRRC : ARM::t2MRRC2); else Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::MRRC : ARM::MRRC2); SmallVector Ops; Ops.push_back(getI32Imm(cast(N->getOperand(2))->getZExtValue(), dl)); /* coproc */ Ops.push_back(getI32Imm(cast(N->getOperand(3))->getZExtValue(), dl)); /* opc */ Ops.push_back(getI32Imm(cast(N->getOperand(4))->getZExtValue(), dl)); /* CRm */ // The mrrc2 instruction in ARM doesn't allow predicates, the top 4 bits of the encoded // instruction will always be '1111' but it is possible in assembly language to specify // AL as a predicate to mrrc2 but it doesn't make any difference to the encoded instruction. if (Opc != ARM::MRRC2) { Ops.push_back(getAL(CurDAG, dl)); Ops.push_back(CurDAG->getRegister(0, MVT::i32)); } Ops.push_back(Chain); // Writes to two registers. const EVT RetType[] = {MVT::i32, MVT::i32, MVT::Other}; ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, RetType, Ops)); return; } case Intrinsic::arm_ldaexd: case Intrinsic::arm_ldrexd: { SDLoc dl(N); SDValue Chain = N->getOperand(0); SDValue MemAddr = N->getOperand(2); bool isThumb = Subtarget->isThumb() && Subtarget->hasV8MBaselineOps(); bool IsAcquire = IntNo == Intrinsic::arm_ldaexd; unsigned NewOpc = isThumb ? (IsAcquire ? ARM::t2LDAEXD : ARM::t2LDREXD) : (IsAcquire ? ARM::LDAEXD : ARM::LDREXD); // arm_ldrexd returns a i64 value in {i32, i32} std::vector ResTys; if (isThumb) { ResTys.push_back(MVT::i32); ResTys.push_back(MVT::i32); } else ResTys.push_back(MVT::Untyped); ResTys.push_back(MVT::Other); // Place arguments in the right order. 
SDValue Ops[] = {MemAddr, getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), Chain}; SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops); // Transfer memoperands. MachineMemOperand *MemOp = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(Ld), {MemOp}); // Remap uses. SDValue OutChain = isThumb ? SDValue(Ld, 2) : SDValue(Ld, 1); if (!SDValue(N, 0).use_empty()) { SDValue Result; if (isThumb) Result = SDValue(Ld, 0); else { SDValue SubRegIdx = CurDAG->getTargetConstant(ARM::gsub_0, dl, MVT::i32); SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i32, SDValue(Ld, 0), SubRegIdx); Result = SDValue(ResNode,0); } ReplaceUses(SDValue(N, 0), Result); } if (!SDValue(N, 1).use_empty()) { SDValue Result; if (isThumb) Result = SDValue(Ld, 1); else { SDValue SubRegIdx = CurDAG->getTargetConstant(ARM::gsub_1, dl, MVT::i32); SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i32, SDValue(Ld, 0), SubRegIdx); Result = SDValue(ResNode,0); } ReplaceUses(SDValue(N, 1), Result); } ReplaceUses(SDValue(N, 2), OutChain); CurDAG->RemoveDeadNode(N); return; } case Intrinsic::arm_stlexd: case Intrinsic::arm_strexd: { SDLoc dl(N); SDValue Chain = N->getOperand(0); SDValue Val0 = N->getOperand(2); SDValue Val1 = N->getOperand(3); SDValue MemAddr = N->getOperand(4); // Store exclusive double return a i32 value which is the return status // of the issued store. const EVT ResTys[] = {MVT::i32, MVT::Other}; bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2(); // Place arguments in the right order. SmallVector Ops; if (isThumb) { Ops.push_back(Val0); Ops.push_back(Val1); } else // arm_strexd uses GPRPair. Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, Val0, Val1), 0)); Ops.push_back(MemAddr); Ops.push_back(getAL(CurDAG, dl)); Ops.push_back(CurDAG->getRegister(0, MVT::i32)); Ops.push_back(Chain); bool IsRelease = IntNo == Intrinsic::arm_stlexd; unsigned NewOpc = isThumb ? (IsRelease ? ARM::t2STLEXD : ARM::t2STREXD) : (IsRelease ? ARM::STLEXD : ARM::STREXD); SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops); // Transfer memoperands. 
MachineMemOperand *MemOp = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(St), {MemOp}); ReplaceNode(N, St); return; } case Intrinsic::arm_neon_vld1: { static const uint16_t DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, ARM::VLD1d32, ARM::VLD1d64 }; static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, ARM::VLD1q32, ARM::VLD1q64}; SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr); return; } case Intrinsic::arm_neon_vld1x2: { static const uint16_t DOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, ARM::VLD1q32, ARM::VLD1q64 }; static const uint16_t QOpcodes[] = { ARM::VLD1d8QPseudo, ARM::VLD1d16QPseudo, ARM::VLD1d32QPseudo, ARM::VLD1d64QPseudo }; SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr); return; } case Intrinsic::arm_neon_vld1x3: { static const uint16_t DOpcodes[] = { ARM::VLD1d8TPseudo, ARM::VLD1d16TPseudo, ARM::VLD1d32TPseudo, ARM::VLD1d64TPseudo }; static const uint16_t QOpcodes0[] = { ARM::VLD1q8LowTPseudo_UPD, ARM::VLD1q16LowTPseudo_UPD, ARM::VLD1q32LowTPseudo_UPD, ARM::VLD1q64LowTPseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VLD1q8HighTPseudo, ARM::VLD1q16HighTPseudo, ARM::VLD1q32HighTPseudo, ARM::VLD1q64HighTPseudo }; SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); return; } case Intrinsic::arm_neon_vld1x4: { static const uint16_t DOpcodes[] = { ARM::VLD1d8QPseudo, ARM::VLD1d16QPseudo, ARM::VLD1d32QPseudo, ARM::VLD1d64QPseudo }; static const uint16_t QOpcodes0[] = { ARM::VLD1q8LowQPseudo_UPD, ARM::VLD1q16LowQPseudo_UPD, ARM::VLD1q32LowQPseudo_UPD, ARM::VLD1q64LowQPseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VLD1q8HighQPseudo, ARM::VLD1q16HighQPseudo, ARM::VLD1q32HighQPseudo, ARM::VLD1q64HighQPseudo }; SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); return; } case Intrinsic::arm_neon_vld2: { static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, ARM::VLD2d32, ARM::VLD1q64 }; static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, ARM::VLD2q32Pseudo }; SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr); return; } case Intrinsic::arm_neon_vld3: { static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo, ARM::VLD3d16Pseudo, ARM::VLD3d32Pseudo, ARM::VLD1d64TPseudo }; static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, ARM::VLD3q16Pseudo_UPD, ARM::VLD3q32Pseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo, ARM::VLD3q16oddPseudo, ARM::VLD3q32oddPseudo }; SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); return; } case Intrinsic::arm_neon_vld4: { static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo, ARM::VLD4d16Pseudo, ARM::VLD4d32Pseudo, ARM::VLD1d64QPseudo }; static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, ARM::VLD4q16Pseudo_UPD, ARM::VLD4q32Pseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo, ARM::VLD4q16oddPseudo, ARM::VLD4q32oddPseudo }; SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); return; } case Intrinsic::arm_neon_vld2dup: { static const uint16_t DOpcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16, ARM::VLD2DUPd32, ARM::VLD1q64 }; static const uint16_t QOpcodes0[] = { ARM::VLD2DUPq8EvenPseudo, ARM::VLD2DUPq16EvenPseudo, ARM::VLD2DUPq32EvenPseudo }; static const uint16_t QOpcodes1[] = { ARM::VLD2DUPq8OddPseudo, ARM::VLD2DUPq16OddPseudo, ARM::VLD2DUPq32OddPseudo }; SelectVLDDup(N, /* IsIntrinsic= */ true, false, 2, DOpcodes, QOpcodes0, QOpcodes1); return; } case Intrinsic::arm_neon_vld3dup: { static const uint16_t DOpcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd32Pseudo, ARM::VLD1d64TPseudo }; static 
const uint16_t QOpcodes0[] = { ARM::VLD3DUPq8EvenPseudo, ARM::VLD3DUPq16EvenPseudo, ARM::VLD3DUPq32EvenPseudo }; static const uint16_t QOpcodes1[] = { ARM::VLD3DUPq8OddPseudo, ARM::VLD3DUPq16OddPseudo, ARM::VLD3DUPq32OddPseudo }; SelectVLDDup(N, /* IsIntrinsic= */ true, false, 3, DOpcodes, QOpcodes0, QOpcodes1); return; } case Intrinsic::arm_neon_vld4dup: { static const uint16_t DOpcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo, ARM::VLD4DUPd32Pseudo, ARM::VLD1d64QPseudo }; static const uint16_t QOpcodes0[] = { ARM::VLD4DUPq8EvenPseudo, ARM::VLD4DUPq16EvenPseudo, ARM::VLD4DUPq32EvenPseudo }; static const uint16_t QOpcodes1[] = { ARM::VLD4DUPq8OddPseudo, ARM::VLD4DUPq16OddPseudo, ARM::VLD4DUPq32OddPseudo }; SelectVLDDup(N, /* IsIntrinsic= */ true, false, 4, DOpcodes, QOpcodes0, QOpcodes1); return; } case Intrinsic::arm_neon_vld2lane: { static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd16Pseudo, ARM::VLD2LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo }; SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes); return; } case Intrinsic::arm_neon_vld3lane: { static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo, ARM::VLD3LNd16Pseudo, ARM::VLD3LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo }; SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes); return; } case Intrinsic::arm_neon_vld4lane: { static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd16Pseudo, ARM::VLD4LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo }; SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes); return; } case Intrinsic::arm_neon_vst1: { static const uint16_t DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, ARM::VST1d32, ARM::VST1d64 }; static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, ARM::VST1q32, ARM::VST1q64 }; SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr); return; } case Intrinsic::arm_neon_vst1x2: { static const uint16_t DOpcodes[] = { ARM::VST1q8, ARM::VST1q16, ARM::VST1q32, ARM::VST1q64 }; static const uint16_t QOpcodes[] = { ARM::VST1d8QPseudo, ARM::VST1d16QPseudo, ARM::VST1d32QPseudo, ARM::VST1d64QPseudo }; SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr); return; } case Intrinsic::arm_neon_vst1x3: { static const uint16_t DOpcodes[] = { ARM::VST1d8TPseudo, ARM::VST1d16TPseudo, ARM::VST1d32TPseudo, ARM::VST1d64TPseudo }; static const uint16_t QOpcodes0[] = { ARM::VST1q8LowTPseudo_UPD, ARM::VST1q16LowTPseudo_UPD, ARM::VST1q32LowTPseudo_UPD, ARM::VST1q64LowTPseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VST1q8HighTPseudo, ARM::VST1q16HighTPseudo, ARM::VST1q32HighTPseudo, ARM::VST1q64HighTPseudo }; SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); return; } case Intrinsic::arm_neon_vst1x4: { static const uint16_t DOpcodes[] = { ARM::VST1d8QPseudo, ARM::VST1d16QPseudo, ARM::VST1d32QPseudo, ARM::VST1d64QPseudo }; static const uint16_t QOpcodes0[] = { ARM::VST1q8LowQPseudo_UPD, ARM::VST1q16LowQPseudo_UPD, ARM::VST1q32LowQPseudo_UPD, ARM::VST1q64LowQPseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VST1q8HighQPseudo, ARM::VST1q16HighQPseudo, ARM::VST1q32HighQPseudo, ARM::VST1q64HighQPseudo }; SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); return; } case Intrinsic::arm_neon_vst2: { static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, ARM::VST2d32, ARM::VST1q64 }; static const uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo, ARM::VST2q32Pseudo }; SelectVST(N, 
false, 2, DOpcodes, QOpcodes, nullptr); return; } case Intrinsic::arm_neon_vst3: { static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo, ARM::VST3d16Pseudo, ARM::VST3d32Pseudo, ARM::VST1d64TPseudo }; static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, ARM::VST3q16Pseudo_UPD, ARM::VST3q32Pseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo, ARM::VST3q16oddPseudo, ARM::VST3q32oddPseudo }; SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); return; } case Intrinsic::arm_neon_vst4: { static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo, ARM::VST4d16Pseudo, ARM::VST4d32Pseudo, ARM::VST1d64QPseudo }; static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, ARM::VST4q16Pseudo_UPD, ARM::VST4q32Pseudo_UPD }; static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo, ARM::VST4q16oddPseudo, ARM::VST4q32oddPseudo }; SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); return; } case Intrinsic::arm_neon_vst2lane: { static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo, ARM::VST2LNd16Pseudo, ARM::VST2LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo }; SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes); return; } case Intrinsic::arm_neon_vst3lane: { static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo, ARM::VST3LNd16Pseudo, ARM::VST3LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo }; SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes); return; } case Intrinsic::arm_neon_vst4lane: { static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo, ARM::VST4LNd16Pseudo, ARM::VST4LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo }; SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes); return; } case Intrinsic::arm_mve_vldr_gather_base_wb: case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: { static const uint16_t Opcodes[] = {ARM::MVE_VLDRWU32_qi_pre, ARM::MVE_VLDRDU64_qi_pre}; SelectMVE_WB(N, Opcodes, IntNo == Intrinsic::arm_mve_vldr_gather_base_wb_predicated); return; } + + case Intrinsic::arm_mve_vld2q: { + static const uint16_t Opcodes8[] = {ARM::MVE_VLD20_8, ARM::MVE_VLD21_8}; + static const uint16_t Opcodes16[] = {ARM::MVE_VLD20_16, + ARM::MVE_VLD21_16}; + static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32, + ARM::MVE_VLD21_32}; + static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; + SelectMVE_VLD(N, 2, Opcodes); + return; + } + + case Intrinsic::arm_mve_vld4q: { + static const uint16_t Opcodes8[] = {ARM::MVE_VLD40_8, ARM::MVE_VLD41_8, + ARM::MVE_VLD42_8, ARM::MVE_VLD43_8}; + static const uint16_t Opcodes16[] = {ARM::MVE_VLD40_16, ARM::MVE_VLD41_16, + ARM::MVE_VLD42_16, + ARM::MVE_VLD43_16}; + static const uint16_t Opcodes32[] = {ARM::MVE_VLD40_32, ARM::MVE_VLD41_32, + ARM::MVE_VLD42_32, + ARM::MVE_VLD43_32}; + static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; + SelectMVE_VLD(N, 4, Opcodes); + return; + } } break; } case ISD::INTRINSIC_WO_CHAIN: { unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); switch (IntNo) { default: break; case Intrinsic::arm_mve_urshrl: SelectMVE_LongShift(N, ARM::MVE_URSHRL, true); return; case Intrinsic::arm_mve_vadc: case Intrinsic::arm_mve_vadc_predicated: SelectMVE_VADCSBC(N, ARM::MVE_VADC, ARM::MVE_VADCI, true, IntNo == Intrinsic::arm_mve_vadc_predicated); return; } break; } case ISD::ATOMIC_CMP_SWAP: SelectCMP_SWAP(N); return; } SelectCode(N); } // Inspect a register string of the form // cp::c:c: (32bit) or // cp::c 
(64bit) inspect the fields of the string // and obtain the integer operands from them, adding these operands to the // provided vector. static void getIntOperandsFromRegisterString(StringRef RegString, SelectionDAG *CurDAG, const SDLoc &DL, std::vector &Ops) { SmallVector Fields; RegString.split(Fields, ':'); if (Fields.size() > 1) { bool AllIntFields = true; for (StringRef Field : Fields) { // Need to trim out leading 'cp' characters and get the integer field. unsigned IntField; AllIntFields &= !Field.trim("CPcp").getAsInteger(10, IntField); Ops.push_back(CurDAG->getTargetConstant(IntField, DL, MVT::i32)); } assert(AllIntFields && "Unexpected non-integer value in special register string."); } } // Maps a Banked Register string to its mask value. The mask value returned is // for use in the MRSbanked / MSRbanked instruction nodes as the Banked Register // mask operand, which expresses which register is to be used, e.g. r8, and in // which mode it is to be used, e.g. usr. Returns -1 to signify that the string // was invalid. static inline int getBankedRegisterMask(StringRef RegString) { auto TheReg = ARMBankedReg::lookupBankedRegByName(RegString.lower()); if (!TheReg) return -1; return TheReg->Encoding; } // The flags here are common to those allowed for apsr in the A class cores and // those allowed for the special registers in the M class cores. Returns a // value representing which flags were present, -1 if invalid. static inline int getMClassFlagsMask(StringRef Flags) { return StringSwitch(Flags) .Case("", 0x2) // no flags means nzcvq for psr registers, and 0x2 is // correct when flags are not permitted .Case("g", 0x1) .Case("nzcvq", 0x2) .Case("nzcvqg", 0x3) .Default(-1); } // Maps MClass special registers string to its value for use in the // t2MRS_M/t2MSR_M instruction nodes as the SYSm value operand. // Returns -1 to signify that the string was invalid. static int getMClassRegisterMask(StringRef Reg, const ARMSubtarget *Subtarget) { auto TheReg = ARMSysReg::lookupMClassSysRegByName(Reg); const FeatureBitset &FeatureBits = Subtarget->getFeatureBits(); if (!TheReg || !TheReg->hasRequiredFeatures(FeatureBits)) return -1; return (int)(TheReg->Encoding & 0xFFF); // SYSm value } static int getARClassRegisterMask(StringRef Reg, StringRef Flags) { // The mask operand contains the special register (R Bit) in bit 4, whether // the register is spsr (R bit is 1) or one of cpsr/apsr (R bit is 0), and // bits 3-0 contains the fields to be accessed in the special register, set by // the flags provided with the register. int Mask = 0; if (Reg == "apsr") { // The flags permitted for apsr are the same flags that are allowed in // M class registers. We get the flag value and then shift the flags into // the correct place to combine with the mask. Mask = getMClassFlagsMask(Flags); if (Mask == -1) return -1; return Mask << 2; } if (Reg != "cpsr" && Reg != "spsr") { return -1; } // This is the same as if the flags were "fc" if (Flags.empty() || Flags == "all") return Mask | 0x9; // Inspect the supplied flags string and set the bits in the mask for // the relevant and valid flags allowed for cpsr and spsr. for (char Flag : Flags) { int FlagVal; switch (Flag) { case 'c': FlagVal = 0x1; break; case 'x': FlagVal = 0x2; break; case 's': FlagVal = 0x4; break; case 'f': FlagVal = 0x8; break; default: FlagVal = 0; } // This avoids allowing strings where the same flag bit appears twice. if (!FlagVal || (Mask & FlagVal)) return -1; Mask |= FlagVal; } // If the register is spsr then we need to set the R bit. 
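  // Worked example (illustrative strings): for "cpsr_fc" the caller splits
  // Reg == "cpsr" and Flags == "fc", so the loop above accumulates
  // Mask = 0x8 | 0x1 = 0x9 (fields f and c, R bit clear). "spsr_fc" produces
  // the same field mask and then has the R bit (0x10) added just below,
  // giving 0x19.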
if (Reg == "spsr") Mask |= 0x10; return Mask; } // Lower the read_register intrinsic to ARM specific DAG nodes // using the supplied metadata string to select the instruction node to use // and the registers/masks to construct as operands for the node. bool ARMDAGToDAGISel::tryReadRegister(SDNode *N){ const MDNodeSDNode *MD = dyn_cast(N->getOperand(1)); const MDString *RegString = dyn_cast(MD->getMD()->getOperand(0)); bool IsThumb2 = Subtarget->isThumb2(); SDLoc DL(N); std::vector Ops; getIntOperandsFromRegisterString(RegString->getString(), CurDAG, DL, Ops); if (!Ops.empty()) { // If the special register string was constructed of fields (as defined // in the ACLE) then need to lower to MRC node (32 bit) or // MRRC node(64 bit), we can make the distinction based on the number of // operands we have. unsigned Opcode; SmallVector ResTypes; if (Ops.size() == 5){ Opcode = IsThumb2 ? ARM::t2MRC : ARM::MRC; ResTypes.append({ MVT::i32, MVT::Other }); } else { assert(Ops.size() == 3 && "Invalid number of fields in special register string."); Opcode = IsThumb2 ? ARM::t2MRRC : ARM::MRRC; ResTypes.append({ MVT::i32, MVT::i32, MVT::Other }); } Ops.push_back(getAL(CurDAG, DL)); Ops.push_back(CurDAG->getRegister(0, MVT::i32)); Ops.push_back(N->getOperand(0)); ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops)); return true; } std::string SpecialReg = RegString->getString().lower(); int BankedReg = getBankedRegisterMask(SpecialReg); if (BankedReg != -1) { Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; ReplaceNode( N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked, DL, MVT::i32, MVT::Other, Ops)); return true; } // The VFP registers are read by creating SelectionDAG nodes with opcodes // corresponding to the register that is being read from. So we switch on the // string to find which opcode we need to use. unsigned Opcode = StringSwitch(SpecialReg) .Case("fpscr", ARM::VMRS) .Case("fpexc", ARM::VMRS_FPEXC) .Case("fpsid", ARM::VMRS_FPSID) .Case("mvfr0", ARM::VMRS_MVFR0) .Case("mvfr1", ARM::VMRS_MVFR1) .Case("mvfr2", ARM::VMRS_MVFR2) .Case("fpinst", ARM::VMRS_FPINST) .Case("fpinst2", ARM::VMRS_FPINST2) .Default(0); // If an opcode was found then we can lower the read to a VFP instruction. if (Opcode) { if (!Subtarget->hasVFP2Base()) return false; if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8Base()) return false; Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops)); return true; } // If the target is M Class then need to validate that the register string // is an acceptable value, so check that a mask can be constructed from the // string. if (Subtarget->isMClass()) { int SYSmValue = getMClassRegisterMask(SpecialReg, Subtarget); if (SYSmValue == -1) return false; SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; ReplaceNode( N, CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops)); return true; } // Here we know the target is not M Class so we need to check if it is one // of the remaining possible values which are apsr, cpsr or spsr. if (SpecialReg == "apsr" || SpecialReg == "cpsr") { Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? 
ARM::t2MRS_AR : ARM::MRS, DL, MVT::i32, MVT::Other, Ops)); return true; } if (SpecialReg == "spsr") { Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; ReplaceNode( N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys, DL, MVT::i32, MVT::Other, Ops)); return true; } return false; } // Lower the write_register intrinsic to ARM specific DAG nodes // using the supplied metadata string to select the instruction node to use // and the registers/masks to use in the nodes bool ARMDAGToDAGISel::tryWriteRegister(SDNode *N){ const MDNodeSDNode *MD = dyn_cast(N->getOperand(1)); const MDString *RegString = dyn_cast(MD->getMD()->getOperand(0)); bool IsThumb2 = Subtarget->isThumb2(); SDLoc DL(N); std::vector Ops; getIntOperandsFromRegisterString(RegString->getString(), CurDAG, DL, Ops); if (!Ops.empty()) { // If the special register string was constructed of fields (as defined // in the ACLE) then need to lower to MCR node (32 bit) or // MCRR node(64 bit), we can make the distinction based on the number of // operands we have. unsigned Opcode; if (Ops.size() == 5) { Opcode = IsThumb2 ? ARM::t2MCR : ARM::MCR; Ops.insert(Ops.begin()+2, N->getOperand(2)); } else { assert(Ops.size() == 3 && "Invalid number of fields in special register string."); Opcode = IsThumb2 ? ARM::t2MCRR : ARM::MCRR; SDValue WriteValue[] = { N->getOperand(2), N->getOperand(3) }; Ops.insert(Ops.begin()+2, WriteValue, WriteValue+2); } Ops.push_back(getAL(CurDAG, DL)); Ops.push_back(CurDAG->getRegister(0, MVT::i32)); Ops.push_back(N->getOperand(0)); ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops)); return true; } std::string SpecialReg = RegString->getString().lower(); int BankedReg = getBankedRegisterMask(SpecialReg); if (BankedReg != -1) { Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), N->getOperand(2), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; ReplaceNode( N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked, DL, MVT::Other, Ops)); return true; } // The VFP registers are written to by creating SelectionDAG nodes with // opcodes corresponding to the register that is being written. So we switch // on the string to find which opcode we need to use. unsigned Opcode = StringSwitch(SpecialReg) .Case("fpscr", ARM::VMSR) .Case("fpexc", ARM::VMSR_FPEXC) .Case("fpsid", ARM::VMSR_FPSID) .Case("fpinst", ARM::VMSR_FPINST) .Case("fpinst2", ARM::VMSR_FPINST2) .Default(0); if (Opcode) { if (!Subtarget->hasVFP2Base()) return false; Ops = { N->getOperand(2), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops)); return true; } std::pair Fields; Fields = StringRef(SpecialReg).rsplit('_'); std::string Reg = Fields.first.str(); StringRef Flags = Fields.second; // If the target was M Class then need to validate the special register value // and retrieve the mask for use in the instruction node. if (Subtarget->isMClass()) { int SYSmValue = getMClassRegisterMask(SpecialReg, Subtarget); if (SYSmValue == -1) return false; SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32), N->getOperand(2), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; ReplaceNode(N, CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops)); return true; } // We then check to see if a valid mask can be constructed for one of the // register string values permitted for the A and R class cores. 
These values // are apsr, spsr and cpsr; these are also valid on older cores. int Mask = getARClassRegisterMask(Reg, Flags); if (Mask != -1) { Ops = { CurDAG->getTargetConstant(Mask, DL, MVT::i32), N->getOperand(2), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR, DL, MVT::Other, Ops)); return true; } return false; } bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){ std::vector AsmNodeOperands; unsigned Flag, Kind; bool Changed = false; unsigned NumOps = N->getNumOperands(); // Normally, i64 data is bounded to two arbitrary GRPs for "%r" constraint. // However, some instrstions (e.g. ldrexd/strexd in ARM mode) require // (even/even+1) GPRs and use %n and %Hn to refer to the individual regs // respectively. Since there is no constraint to explicitly specify a // reg pair, we use GPRPair reg class for "%r" for 64-bit data. For Thumb, // the 64-bit data may be referred by H, Q, R modifiers, so we still pack // them into a GPRPair. SDLoc dl(N); SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1) : SDValue(nullptr,0); SmallVector OpChanged; // Glue node will be appended late. for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) { SDValue op = N->getOperand(i); AsmNodeOperands.push_back(op); if (i < InlineAsm::Op_FirstOperand) continue; if (ConstantSDNode *C = dyn_cast(N->getOperand(i))) { Flag = C->getZExtValue(); Kind = InlineAsm::getKind(Flag); } else continue; // Immediate operands to inline asm in the SelectionDAG are modeled with // two operands. The first is a constant of value InlineAsm::Kind_Imm, and // the second is a constant with the value of the immediate. If we get here // and we have a Kind_Imm, skip the next operand, and continue. if (Kind == InlineAsm::Kind_Imm) { SDValue op = N->getOperand(++i); AsmNodeOperands.push_back(op); continue; } unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag); if (NumRegs) OpChanged.push_back(false); unsigned DefIdx = 0; bool IsTiedToChangedOp = false; // If it's a use that is tied with a previous def, it has no // reg class constraint. if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx)) IsTiedToChangedOp = OpChanged[DefIdx]; // Memory operands to inline asm in the SelectionDAG are modeled with two // operands: a constant of value InlineAsm::Kind_Mem followed by the input // operand. If we get here and we have a Kind_Mem, skip the next operand (so // it doesn't get misinterpreted), and continue. We do this here because // it's important to update the OpChanged array correctly before moving on. if (Kind == InlineAsm::Kind_Mem) { SDValue op = N->getOperand(++i); AsmNodeOperands.push_back(op); continue; } if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef && Kind != InlineAsm::Kind_RegDefEarlyClobber) continue; unsigned RC; bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC); if ((!IsTiedToChangedOp && (!HasRC || RC != ARM::GPRRegClassID)) || NumRegs != 2) continue; assert((i+2 < NumOps) && "Invalid number of operands in inline asm"); SDValue V0 = N->getOperand(i+1); SDValue V1 = N->getOperand(i+2); unsigned Reg0 = cast(V0)->getReg(); unsigned Reg1 = cast(V1)->getReg(); SDValue PairedReg; MachineRegisterInfo &MRI = MF->getRegInfo(); if (Kind == InlineAsm::Kind_RegDef || Kind == InlineAsm::Kind_RegDefEarlyClobber) { // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to // the original GPRs. 
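// Illustrative example (not from the original source): for 64-bit inline asm
// such as
//   asm("ldrexd %0, %H0, [%1]" : "=&r"(Val) : "r"(Addr));
// the i64 output arrives here as two i32 GPR defs; they are folded into one
// GPRPair def, and the two original GPRs are recreated below by extracting
// the gsub_0/gsub_1 subregisters from the pair.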
Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); SDValue Chain = SDValue(N,0); SDNode *GU = N->getGluedUser(); SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::Untyped, Chain.getValue(1)); // Extract values from a GPRPair reg and copy to the original GPR reg. SDValue Sub0 = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32, RegCopy); SDValue Sub1 = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32, RegCopy); SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0, RegCopy.getValue(1)); SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1)); // Update the original glue user. std::vector Ops(GU->op_begin(), GU->op_end()-1); Ops.push_back(T1.getValue(1)); CurDAG->UpdateNodeOperands(GU, Ops); } else { // For Kind == InlineAsm::Kind_RegUse, we first copy two GPRs into a // GPRPair and then pass the GPRPair to the inline asm. SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain]; // As REG_SEQ doesn't take RegisterSDNode, we copy them first. SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32, Chain.getValue(1)); SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32, T0.getValue(1)); SDValue Pair = SDValue(createGPRPairNode(MVT::Untyped, T0, T1), 0); // Copy REG_SEQ into a GPRPair-typed VR and replace the original two // i32 VRs of inline asm with it. Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; Glue = Chain.getValue(1); } Changed = true; if(PairedReg.getNode()) { OpChanged[OpChanged.size() -1 ] = true; Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/); if (IsTiedToChangedOp) Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx); else Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID); // Replace the current flag. AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant( Flag, dl, MVT::i32); // Add the new register node and skip the original two GPRs. AsmNodeOperands.push_back(PairedReg); // Skip the next two GPRs. i += 2; } } if (Glue.getNode()) AsmNodeOperands.push_back(Glue); if (!Changed) return false; SDValue New = CurDAG->getNode(N->getOpcode(), SDLoc(N), CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); New->setNodeId(-1); ReplaceNode(N, New.getNode()); return true; } bool ARMDAGToDAGISel:: SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) { switch(ConstraintID) { default: llvm_unreachable("Unexpected asm memory constraint"); case InlineAsm::Constraint_i: // FIXME: It seems strange that 'i' is needed here since it's supposed to // be an immediate and not a memory constraint. LLVM_FALLTHROUGH; case InlineAsm::Constraint_m: case InlineAsm::Constraint_o: case InlineAsm::Constraint_Q: case InlineAsm::Constraint_Um: case InlineAsm::Constraint_Un: case InlineAsm::Constraint_Uq: case InlineAsm::Constraint_Us: case InlineAsm::Constraint_Ut: case InlineAsm::Constraint_Uv: case InlineAsm::Constraint_Uy: // Require the address to be in a register. That is safe for all ARM // variants and it is hard to do anything much smarter without knowing // how the operand is used. OutOps.push_back(Op); return false; } return true; } /// createARMISelDag - This pass converts a legalized DAG into a /// ARM-specific DAG, ready for instruction scheduling. 
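/// (Usage sketch, not part of the original comment: the pass is normally
/// instantiated from the target's pass configuration, e.g.
///   addPass(createARMISelDag(getARMTargetMachine(), getOptLevel()));
/// inside ARMPassConfig::addInstSelector().)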
/// FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel) { return new ARMDAGToDAGISel(TM, OptLevel); } diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 7d49df3d0c07..e43d64393a6b 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -1,5531 +1,5554 @@ //===-- ARMInstrMVE.td - MVE support for ARM ---------------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file describes the ARM MVE instruction set. // //===----------------------------------------------------------------------===// class ExpandImmAsmOp : AsmOperandClass { let Name = !strconcat("ExpandImm", shift); let PredicateMethod = !strconcat("isExpImm<", shift, ">"); let RenderMethod = "addImmOperands"; } class InvertedExpandImmAsmOp : AsmOperandClass { let Name = !strconcat("InvertedExpandImm", shift, "_", size); let PredicateMethod = !strconcat("isInvertedExpImm<", shift, ",", size, ">"); let RenderMethod = "addImmOperands"; } class ExpandImm : Operand { let ParserMatchClass = ExpandImmAsmOp; let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",false>"); let DecoderMethod = !strconcat("DecodeExpandedImmOperand<",shift,">"); let PrintMethod = "printExpandedImmOperand"; } class InvertedExpandImm : Operand { let ParserMatchClass = InvertedExpandImmAsmOp; let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",true>"); let PrintMethod = "printExpandedImmOperand"; // No decoder method needed, because this operand type is only used // by aliases (VAND and VORN) } def expzero00 : ExpandImm<"0">; def expzero08 : ExpandImm<"8">; def expzero16 : ExpandImm<"16">; def expzero24 : ExpandImm<"24">; def expzero00inv16 : InvertedExpandImm<"0", "16">; def expzero08inv16 : InvertedExpandImm<"8", "16">; def expzero00inv32 : InvertedExpandImm<"0", "32">; def expzero08inv32 : InvertedExpandImm<"8", "32">; def expzero16inv32 : InvertedExpandImm<"16", "32">; def expzero24inv32 : InvertedExpandImm<"24", "32">; // VPT condition mask def vpt_mask : Operand { let PrintMethod = "printVPTMask"; let ParserMatchClass = it_mask_asmoperand; let EncoderMethod = "getVPTMaskOpValue"; let DecoderMethod = "DecodeVPTMaskOperand"; } // VPT/VCMP restricted predicate for sign invariant types def pred_restricted_i_asmoperand : AsmOperandClass { let Name = "CondCodeRestrictedI"; let RenderMethod = "addITCondCodeOperands"; let PredicateMethod = "isITCondCodeRestrictedI"; let ParserMethod = "parseITCondCode"; let DiagnosticString = "condition code for sign-independent integer "# "comparison must be EQ or NE"; } // VPT/VCMP restricted predicate for signed types def pred_restricted_s_asmoperand : AsmOperandClass { let Name = "CondCodeRestrictedS"; let RenderMethod = "addITCondCodeOperands"; let PredicateMethod = "isITCondCodeRestrictedS"; let ParserMethod = "parseITCondCode"; let DiagnosticString = "condition code for signed integer "# "comparison must be EQ, NE, LT, GT, LE or GE"; } // VPT/VCMP restricted predicate for unsigned types def pred_restricted_u_asmoperand : AsmOperandClass { let Name = "CondCodeRestrictedU"; let RenderMethod = "addITCondCodeOperands"; let PredicateMethod = "isITCondCodeRestrictedU"; let ParserMethod = "parseITCondCode"; let DiagnosticString = 
"condition code for unsigned integer "# "comparison must be EQ, NE, HS or HI"; } // VPT/VCMP restricted predicate for floating point def pred_restricted_fp_asmoperand : AsmOperandClass { let Name = "CondCodeRestrictedFP"; let RenderMethod = "addITCondCodeOperands"; let PredicateMethod = "isITCondCodeRestrictedFP"; let ParserMethod = "parseITCondCode"; let DiagnosticString = "condition code for floating-point "# "comparison must be EQ, NE, LT, GT, LE or GE"; } class VCMPPredicateOperand : Operand; def pred_basic_i : VCMPPredicateOperand { let PrintMethod = "printMandatoryRestrictedPredicateOperand"; let ParserMatchClass = pred_restricted_i_asmoperand; let DecoderMethod = "DecodeRestrictedIPredicateOperand"; let EncoderMethod = "getRestrictedCondCodeOpValue"; } def pred_basic_u : VCMPPredicateOperand { let PrintMethod = "printMandatoryRestrictedPredicateOperand"; let ParserMatchClass = pred_restricted_u_asmoperand; let DecoderMethod = "DecodeRestrictedUPredicateOperand"; let EncoderMethod = "getRestrictedCondCodeOpValue"; } def pred_basic_s : VCMPPredicateOperand { let PrintMethod = "printMandatoryRestrictedPredicateOperand"; let ParserMatchClass = pred_restricted_s_asmoperand; let DecoderMethod = "DecodeRestrictedSPredicateOperand"; let EncoderMethod = "getRestrictedCondCodeOpValue"; } def pred_basic_fp : VCMPPredicateOperand { let PrintMethod = "printMandatoryRestrictedPredicateOperand"; let ParserMatchClass = pred_restricted_fp_asmoperand; let DecoderMethod = "DecodeRestrictedFPPredicateOperand"; let EncoderMethod = "getRestrictedCondCodeOpValue"; } // Register list operands for interleaving load/stores def VecList2QAsmOperand : AsmOperandClass { let Name = "VecListTwoMQ"; let ParserMethod = "parseVectorList"; let RenderMethod = "addMVEVecListOperands"; let DiagnosticString = "operand must be a list of two consecutive "# "q-registers in range [q0,q7]"; } def VecList2Q : RegisterOperand { let ParserMatchClass = VecList2QAsmOperand; let PrintMethod = "printMVEVectorList<2>"; } def VecList4QAsmOperand : AsmOperandClass { let Name = "VecListFourMQ"; let ParserMethod = "parseVectorList"; let RenderMethod = "addMVEVecListOperands"; let DiagnosticString = "operand must be a list of four consecutive "# "q-registers in range [q0,q7]"; } def VecList4Q : RegisterOperand { let ParserMatchClass = VecList4QAsmOperand; let PrintMethod = "printMVEVectorList<4>"; } // taddrmode_imm7 := reg[r0-r7] +/- (imm7 << shift) class TMemImm7ShiftOffsetAsmOperand : AsmOperandClass { let Name = "TMemImm7Shift"#shift#"Offset"; let PredicateMethod = "isMemImm7ShiftedOffset<"#shift#",ARM::tGPRRegClassID>"; let RenderMethod = "addMemImmOffsetOperands"; } class taddrmode_imm7 : MemOperand, ComplexPattern", []> { let ParserMatchClass = TMemImm7ShiftOffsetAsmOperand; // They are printed the same way as the T2 imm8 version let PrintMethod = "printT2AddrModeImm8Operand"; // This can also be the same as the T2 version. 
let EncoderMethod = "getT2AddrModeImmOpValue<7,"#shift#">"; let DecoderMethod = "DecodeTAddrModeImm7<"#shift#">"; let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm); } // t2addrmode_imm7 := reg +/- (imm7) class MemImm7ShiftOffsetAsmOperand : AsmOperandClass { let Name = "MemImm7Shift"#shift#"Offset"; let PredicateMethod = "isMemImm7ShiftedOffset<" # shift # ",ARM::GPRnopcRegClassID>"; let RenderMethod = "addMemImmOffsetOperands"; } def MemImm7Shift0OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<0>; def MemImm7Shift1OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<1>; def MemImm7Shift2OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<2>; class T2AddrMode_Imm7 : MemOperand, ComplexPattern", []> { let EncoderMethod = "getT2AddrModeImmOpValue<7,"#shift#">"; let DecoderMethod = "DecodeT2AddrModeImm7<"#shift#", 0>"; let ParserMatchClass = !cast("MemImm7Shift"#shift#"OffsetAsmOperand"); let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm); } class t2addrmode_imm7 : T2AddrMode_Imm7 { // They are printed the same way as the imm8 version let PrintMethod = "printT2AddrModeImm8Operand"; } class MemImm7ShiftOffsetWBAsmOperand : AsmOperandClass { let Name = "MemImm7Shift"#shift#"OffsetWB"; let PredicateMethod = "isMemImm7ShiftedOffset<" # shift # ",ARM::rGPRRegClassID>"; let RenderMethod = "addMemImmOffsetOperands"; } def MemImm7Shift0OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<0>; def MemImm7Shift1OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<1>; def MemImm7Shift2OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<2>; class t2addrmode_imm7_pre : T2AddrMode_Imm7 { // They are printed the same way as the imm8 version let PrintMethod = "printT2AddrModeImm8Operand"; let ParserMatchClass = !cast("MemImm7Shift"#shift#"OffsetWBAsmOperand"); let DecoderMethod = "DecodeT2AddrModeImm7<"#shift#", 1>"; let MIOperandInfo = (ops rGPR:$base, i32imm:$offsim); } class t2am_imm7shiftOffsetAsmOperand : AsmOperandClass { let Name = "Imm7Shift"#shift; } def t2am_imm7shift0OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<0>; def t2am_imm7shift1OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<1>; def t2am_imm7shift2OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<2>; class t2am_imm7_offset : MemOperand, ComplexPattern", [], [SDNPWantRoot]> { // They are printed the same way as the imm8 version let PrintMethod = "printT2AddrModeImm8OffsetOperand"; let ParserMatchClass = !cast("t2am_imm7shift"#shift#"OffsetAsmOperand"); let EncoderMethod = "getT2ScaledImmOpValue<7,"#shift#">"; let DecoderMethod = "DecodeT2Imm7<"#shift#">"; } // Operands for gather/scatter loads of the form [Rbase, Qoffsets] class MemRegRQOffsetAsmOperand : AsmOperandClass { let Name = "MemRegRQS"#shift#"Offset"; let PredicateMethod = "isMemRegRQOffset<"#shift#">"; let RenderMethod = "addMemRegRQOffsetOperands"; } def MemRegRQS0OffsetAsmOperand : MemRegRQOffsetAsmOperand<0>; def MemRegRQS1OffsetAsmOperand : MemRegRQOffsetAsmOperand<1>; def MemRegRQS2OffsetAsmOperand : MemRegRQOffsetAsmOperand<2>; def MemRegRQS3OffsetAsmOperand : MemRegRQOffsetAsmOperand<3>; // mve_addr_rq_shift := reg + vreg{ << UXTW #shift} class mve_addr_rq_shift : MemOperand { let EncoderMethod = "getMveAddrModeRQOpValue"; let PrintMethod = "printMveAddrModeRQOperand<"#shift#">"; let ParserMatchClass = !cast("MemRegRQS"#shift#"OffsetAsmOperand"); let DecoderMethod = "DecodeMveAddrModeRQ"; let MIOperandInfo = (ops GPRnopc:$base, MQPR:$offsreg); } class MemRegQOffsetAsmOperand : AsmOperandClass { let Name = "MemRegQS"#shift#"Offset"; let PredicateMethod = 
"isMemRegQOffset<"#shift#">"; let RenderMethod = "addMemImmOffsetOperands"; } def MemRegQS2OffsetAsmOperand : MemRegQOffsetAsmOperand<2>; def MemRegQS3OffsetAsmOperand : MemRegQOffsetAsmOperand<3>; // mve_addr_q_shift := vreg {+ #imm7s2/4} class mve_addr_q_shift : MemOperand { let EncoderMethod = "getMveAddrModeQOpValue<"#shift#">"; // Can be printed same way as other reg + imm operands let PrintMethod = "printT2AddrModeImm8Operand"; let ParserMatchClass = !cast("MemRegQS"#shift#"OffsetAsmOperand"); let DecoderMethod = "DecodeMveAddrModeQ<"#shift#">"; let MIOperandInfo = (ops MQPR:$base, i32imm:$imm); } // A family of classes wrapping up information about the vector types // used by MVE. class MVEVectorVTInfo size, string suffix, bit unsigned> { // The LLVM ValueType representing the vector, so we can use it in // ISel patterns. ValueType Vec = vec; // An LLVM ValueType representing a corresponding vector of // predicate bits, for use in ISel patterns that handle an IR // intrinsic describing the predicated form of the instruction. // // Usually, for a vector of N things, this will be vNi1. But for // vectors of 2 values, we make an exception, and use v4i1 instead // of v2i1. Rationale: MVE codegen doesn't support doing all the // auxiliary operations on v2i1 (vector shuffles etc), and also, // there's no MVE compare instruction that will _generate_ v2i1 // directly. ValueType Pred = pred; // The most common representation of the vector element size in MVE // instruction encodings: a 2-bit value V representing an (8< Size = size; // For vectors explicitly mentioning a signedness of integers: 0 for // signed and 1 for unsigned. For anything else, undefined. bit Unsigned = unsigned; // The suffix used on the instruction in assembly language. string Suffix = suffix; } // Integer vector types that don't treat signed and unsigned differently. def MVE_v16i8 : MVEVectorVTInfo; def MVE_v8i16 : MVEVectorVTInfo; def MVE_v4i32 : MVEVectorVTInfo; def MVE_v2i64 : MVEVectorVTInfo; // Explicitly signed and unsigned integer vectors. They map to the // same set of LLVM ValueTypes as above, but are represented // differently in assembly and instruction encodings. def MVE_v16s8 : MVEVectorVTInfo; def MVE_v8s16 : MVEVectorVTInfo; def MVE_v4s32 : MVEVectorVTInfo; def MVE_v2s64 : MVEVectorVTInfo; def MVE_v16u8 : MVEVectorVTInfo; def MVE_v8u16 : MVEVectorVTInfo; def MVE_v4u32 : MVEVectorVTInfo; def MVE_v2u64 : MVEVectorVTInfo; // FP vector types. def MVE_v8f16 : MVEVectorVTInfo; def MVE_v4f32 : MVEVectorVTInfo; def MVE_v2f64 : MVEVectorVTInfo; // --------- Start of base classes for the instructions themselves class MVE_MI pattern> : Thumb2XI, Requires<[HasMVEInt]> { let D = MVEDomain; let DecoderNamespace = "MVE"; } // MVE_p is used for most predicated instructions, to add the cluster // of input operands that provides the VPT suffix (none, T or E) and // the input predicate register. 
class MVE_p pattern=[]> : MVE_MI { let Inst{31-29} = 0b111; let Inst{27-26} = 0b11; } class MVE_f pattern=[]> : MVE_p { let Predicates = [HasMVEFloat]; } class MVE_MI_with_pred pattern> : Thumb2I, Requires<[HasV8_1MMainline, HasMVEInt]> { let D = MVEDomain; let DecoderNamespace = "MVE"; } class MVE_VMOV_lane_base pattern> : Thumb2I, Requires<[HasV8_1MMainline, HasMVEInt]> { let D = MVEDomain; let DecoderNamespace = "MVE"; } class MVE_ScalarShift pattern=[]> : MVE_MI_with_pred { let Inst{31-20} = 0b111010100101; let Inst{8} = 0b1; } class MVE_ScalarShiftSingleReg pattern=[]> : MVE_ScalarShift { bits<4> RdaDest; let Inst{19-16} = RdaDest{3-0}; } class MVE_ScalarShiftSRegImm op5_4, list pattern=[]> : MVE_ScalarShiftSingleReg { bits<5> imm; let Inst{15} = 0b0; let Inst{14-12} = imm{4-2}; let Inst{11-8} = 0b1111; let Inst{7-6} = imm{1-0}; let Inst{5-4} = op5_4{1-0}; let Inst{3-0} = 0b1111; } def MVE_SQSHL : MVE_ScalarShiftSRegImm<"sqshl", 0b11>; def MVE_SRSHR : MVE_ScalarShiftSRegImm<"srshr", 0b10>; def MVE_UQSHL : MVE_ScalarShiftSRegImm<"uqshl", 0b00>; def MVE_URSHR : MVE_ScalarShiftSRegImm<"urshr", 0b01>; class MVE_ScalarShiftSRegReg op5_4, list pattern=[]> : MVE_ScalarShiftSingleReg { bits<4> Rm; let Inst{15-12} = Rm{3-0}; let Inst{11-8} = 0b1111; let Inst{7-6} = 0b00; let Inst{5-4} = op5_4{1-0}; let Inst{3-0} = 0b1101; let Unpredictable{8-6} = 0b111; } def MVE_SQRSHR : MVE_ScalarShiftSRegReg<"sqrshr", 0b10>; def MVE_UQRSHL : MVE_ScalarShiftSRegReg<"uqrshl", 0b00>; class MVE_ScalarShiftDoubleReg pattern=[]> : MVE_ScalarShift { bits<4> RdaLo; bits<4> RdaHi; let Inst{19-17} = RdaLo{3-1}; let Inst{11-9} = RdaHi{3-1}; } class MVE_ScalarShiftDRegImm op5_4, bit op16, list pattern=[]> : MVE_ScalarShiftDoubleReg< iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, long_shift:$imm), "$RdaLo, $RdaHi, $imm", "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", pattern> { bits<5> imm; let Inst{16} = op16; let Inst{15} = 0b0; let Inst{14-12} = imm{4-2}; let Inst{7-6} = imm{1-0}; let Inst{5-4} = op5_4{1-0}; let Inst{3-0} = 0b1111; } class MVE_ScalarShiftDRegRegBase pattern=[]> : MVE_ScalarShiftDoubleReg< iname, iops, asm, "@earlyclobber $RdaHi,@earlyclobber $RdaLo," "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", pattern> { bits<4> Rm; let Inst{16} = op16; let Inst{15-12} = Rm{3-0}; let Inst{6} = 0b0; let Inst{5} = op5; let Inst{4} = 0b0; let Inst{3-0} = 0b1101; // Custom decoder method because of the following overlapping encodings: // ASRL and SQRSHR // LSLL and UQRSHL // SQRSHRL and SQRSHR // UQRSHLL and UQRSHL let DecoderMethod = "DecodeMVEOverlappingLongShift"; } class MVE_ScalarShiftDRegReg pattern=[]> : MVE_ScalarShiftDRegRegBase< iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm), "$RdaLo, $RdaHi, $Rm", op5, 0b0, pattern> { let Inst{7} = 0b0; } class MVE_ScalarShiftDRegRegWithSat pattern=[]> : MVE_ScalarShiftDRegRegBase< iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm, saturateop:$sat), "$RdaLo, $RdaHi, $sat, $Rm", op5, 0b1, pattern> { bit sat; let Inst{7} = sat; } def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMasrl tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm))]>; def MVE_ASRLi : MVE_ScalarShiftDRegImm<"asrl", 0b10, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMasrl tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsll tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm))]>; def MVE_LSLLi : MVE_ScalarShiftDRegImm<"lsll", 
0b00, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsll tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; def MVE_LSRL : MVE_ScalarShiftDRegImm<"lsrl", 0b01, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsrl tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; def MVE_SQRSHRL : MVE_ScalarShiftDRegRegWithSat<"sqrshrl", 0b1>; def MVE_SQSHLL : MVE_ScalarShiftDRegImm<"sqshll", 0b11, 0b1>; def MVE_SRSHRL : MVE_ScalarShiftDRegImm<"srshrl", 0b10, 0b1>; def MVE_UQRSHLL : MVE_ScalarShiftDRegRegWithSat<"uqrshll", 0b0>; def MVE_UQSHLL : MVE_ScalarShiftDRegImm<"uqshll", 0b00, 0b1>; def MVE_URSHRL : MVE_ScalarShiftDRegImm<"urshrl", 0b01, 0b1>; // start of mve_rDest instructions class MVE_rDest pattern=[]> // Always use vpred_n and not vpred_r: with the output register being // a GPR and not a vector register, there can't be any question of // what to put in its inactive lanes. : MVE_p { let Inst{25-23} = 0b101; let Inst{11-9} = 0b111; let Inst{4} = 0b0; } class MVE_VABAV size, list pattern=[]> : MVE_rDest<(outs rGPR:$Rda), (ins rGPR:$Rda_src, MQPR:$Qn, MQPR:$Qm), NoItinerary, "vabav", suffix, "$Rda, $Qn, $Qm", "$Rda = $Rda_src", pattern> { bits<4> Qm; bits<4> Qn; bits<4> Rda; let Inst{28} = U; let Inst{22} = 0b0; let Inst{21-20} = size{1-0}; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{15-12} = Rda{3-0}; let Inst{8} = 0b1; let Inst{7} = Qn{3}; let Inst{6} = 0b0; let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; } def MVE_VABAVs8 : MVE_VABAV<"s8", 0b0, 0b00>; def MVE_VABAVs16 : MVE_VABAV<"s16", 0b0, 0b01>; def MVE_VABAVs32 : MVE_VABAV<"s32", 0b0, 0b10>; def MVE_VABAVu8 : MVE_VABAV<"u8", 0b1, 0b00>; def MVE_VABAVu16 : MVE_VABAV<"u16", 0b1, 0b01>; def MVE_VABAVu32 : MVE_VABAV<"u32", 0b1, 0b10>; class MVE_VADDV size, list pattern=[]> : MVE_rDest<(outs tGPREven:$Rda), iops, NoItinerary, iname, suffix, "$Rda, $Qm", cstr, pattern> { bits<3> Qm; bits<4> Rda; let Inst{28} = U; let Inst{22-20} = 0b111; let Inst{19-18} = size{1-0}; let Inst{17-16} = 0b01; let Inst{15-13} = Rda{3-1}; let Inst{12} = 0b0; let Inst{8-6} = 0b100; let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; } multiclass MVE_VADDV_A size, list pattern=[]> { def acc : MVE_VADDV<"vaddva", suffix, (ins tGPREven:$Rda_src, MQPR:$Qm), "$Rda = $Rda_src", 0b1, U, size, pattern>; def no_acc : MVE_VADDV<"vaddv", suffix, (ins MQPR:$Qm), "", 0b0, U, size, pattern>; } defm MVE_VADDVs8 : MVE_VADDV_A<"s8", 0b0, 0b00>; defm MVE_VADDVs16 : MVE_VADDV_A<"s16", 0b0, 0b01>; defm MVE_VADDVs32 : MVE_VADDV_A<"s32", 0b0, 0b10>; defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>; defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>; defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>; let Predicates = [HasMVEInt] in { def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>; def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>; def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>; def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))), (i32 (MVE_VADDVu32acc $src2, $src1))>; def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))), (i32 (MVE_VADDVu16acc $src2, $src1))>; def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))), (i32 (MVE_VADDVu8acc $src2, $src1))>; } class MVE_VADDLV pattern=[]> : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname, suffix, "$RdaLo, $RdaHi, $Qm", cstr, pattern> { bits<3> Qm; bits<4> RdaLo; bits<4> 
RdaHi; let Inst{28} = U; let Inst{22-20} = RdaHi{3-1}; let Inst{19-18} = 0b10; let Inst{17-16} = 0b01; let Inst{15-13} = RdaLo{3-1}; let Inst{12} = 0b0; let Inst{8-6} = 0b100; let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; } multiclass MVE_VADDLV_A pattern=[]> { def acc : MVE_VADDLV<"vaddlva", suffix, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm), "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", 0b1, U, pattern>; def no_acc : MVE_VADDLV<"vaddlv", suffix, (ins MQPR:$Qm), "", 0b0, U, pattern>; } defm MVE_VADDLVs32 : MVE_VADDLV_A<"s32", 0b0>; defm MVE_VADDLVu32 : MVE_VADDLV_A<"u32", 0b1>; class MVE_VMINMAXNMV pattern=[]> : MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm), NoItinerary, iname, suffix, "$RdaSrc, $Qm", "$RdaDest = $RdaSrc", pattern> { bits<3> Qm; bits<4> RdaDest; let Inst{28} = sz; let Inst{22-20} = 0b110; let Inst{19-18} = 0b11; let Inst{17} = bit_17; let Inst{16} = 0b0; let Inst{15-12} = RdaDest{3-0}; let Inst{8} = 0b1; let Inst{7} = bit_7; let Inst{6-5} = 0b00; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; let Predicates = [HasMVEFloat]; } multiclass MVE_VMINMAXNMV_fty pattern=[]> { def f32 : MVE_VMINMAXNMV; def f16 : MVE_VMINMAXNMV; } defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 0b1>; defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 0b0>; multiclass MVE_VMINMAXNMAV_fty pattern=[]> { def f32 : MVE_VMINMAXNMV; def f16 : MVE_VMINMAXNMV; } defm MVE_VMINNMAV : MVE_VMINMAXNMAV_fty<"vminnmav", 0b1>; defm MVE_VMAXNMAV : MVE_VMINMAXNMAV_fty<"vmaxnmav", 0b0>; class MVE_VMINMAXV size, bit bit_17, bit bit_7, list pattern=[]> : MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm), NoItinerary, iname, suffix, "$RdaSrc, $Qm", "$RdaDest = $RdaSrc", pattern> { bits<3> Qm; bits<4> RdaDest; let Inst{28} = U; let Inst{22-20} = 0b110; let Inst{19-18} = size{1-0}; let Inst{17} = bit_17; let Inst{16} = 0b0; let Inst{15-12} = RdaDest{3-0}; let Inst{8} = 0b1; let Inst{7} = bit_7; let Inst{6-5} = 0b00; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; } multiclass MVE_VMINMAXV_p { def "": MVE_VMINMAXV; let Predicates = [HasMVEInt] in def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))), (i32 (!cast(NAME) (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>; } multiclass MVE_VMINMAXV_ty { defm s8 : MVE_VMINMAXV_p; defm s16: MVE_VMINMAXV_p; defm s32: MVE_VMINMAXV_p; defm u8 : MVE_VMINMAXV_p; defm u16: MVE_VMINMAXV_p; defm u32: MVE_VMINMAXV_p; } defm MVE_VMINV : MVE_VMINMAXV_ty< "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>; defm MVE_VMAXV : MVE_VMINMAXV_ty< "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>; let Predicates = [HasMVEInt] in { def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))), (i32 (MVE_VMAXVs8 (t2MVNi (i32 127)), $src))>; def : Pat<(i32 (vecreduce_smax (v8i16 MQPR:$src))), (i32 (MVE_VMAXVs16 (t2MOVi32imm (i32 -32768)), $src))>; def : Pat<(i32 (vecreduce_smax (v4i32 MQPR:$src))), (i32 (MVE_VMAXVs32 (t2MOVi (i32 -2147483648)), $src))>; def : Pat<(i32 (vecreduce_umax (v16i8 MQPR:$src))), (i32 (MVE_VMAXVu8 (t2MOVi (i32 0)), $src))>; def : Pat<(i32 (vecreduce_umax (v8i16 MQPR:$src))), (i32 (MVE_VMAXVu16 (t2MOVi (i32 0)), $src))>; def : Pat<(i32 (vecreduce_umax (v4i32 MQPR:$src))), (i32 (MVE_VMAXVu32 (t2MOVi (i32 0)), $src))>; def : Pat<(i32 (vecreduce_smin (v16i8 MQPR:$src))), (i32 (MVE_VMINVs8 (t2MOVi (i32 127)), $src))>; def : Pat<(i32 (vecreduce_smin (v8i16 MQPR:$src))), (i32 (MVE_VMINVs16 (t2MOVi16 (i32 32767)), $src))>; def : Pat<(i32 (vecreduce_smin (v4i32 MQPR:$src))), (i32 (MVE_VMINVs32 (t2MVNi (i32 -2147483648)), $src))>; def : Pat<(i32 
(vecreduce_umin (v16i8 MQPR:$src))), (i32 (MVE_VMINVu8 (t2MOVi (i32 255)), $src))>; def : Pat<(i32 (vecreduce_umin (v8i16 MQPR:$src))), (i32 (MVE_VMINVu16 (t2MOVi16 (i32 65535)), $src))>; def : Pat<(i32 (vecreduce_umin (v4i32 MQPR:$src))), (i32 (MVE_VMINVu32 (t2MOVi (i32 4294967295)), $src))>; } multiclass MVE_VMINMAXAV_ty pattern=[]> { def s8 : MVE_VMINMAXV; def s16 : MVE_VMINMAXV; def s32 : MVE_VMINMAXV; } defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>; defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>; class MVE_VMLAMLSDAV pattern=[]> : MVE_rDest<(outs tGPREven:$RdaDest), iops, NoItinerary, iname, suffix, "$RdaDest, $Qn, $Qm", cstr, pattern> { bits<4> RdaDest; bits<3> Qm; bits<3> Qn; let Inst{28} = bit_28; let Inst{22-20} = 0b111; let Inst{19-17} = Qn{2-0}; let Inst{16} = sz; let Inst{15-13} = RdaDest{3-1}; let Inst{12} = X; let Inst{8} = bit_8; let Inst{7-6} = 0b00; let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = bit_0; } multiclass MVE_VMLAMLSDAV_A pattern=[]> { def ""#x#suffix : MVE_VMLAMLSDAV; def "a"#x#suffix : MVE_VMLAMLSDAV; } multiclass MVE_VMLAMLSDAV_AX pattern=[]> { defm "" : MVE_VMLAMLSDAV_A; defm "" : MVE_VMLAMLSDAV_A; } multiclass MVE_VMLADAV_multi pattern=[]> { defm "" : MVE_VMLAMLSDAV_AX<"vmladav", "s"#suffix, sz, 0b0, bit_8, 0b0, pattern>; defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", "u"#suffix, sz, 0b1, 0b0, bit_8, 0b0, pattern>; } multiclass MVE_VMLSDAV_multi pattern=[]> { defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", "s"#suffix, sz, bit_28, 0b0, 0b1, pattern>; } defm MVE_VMLADAV : MVE_VMLADAV_multi< "8", 0b0, 0b1>; defm MVE_VMLADAV : MVE_VMLADAV_multi<"16", 0b0, 0b0>; defm MVE_VMLADAV : MVE_VMLADAV_multi<"32", 0b1, 0b0>; defm MVE_VMLSDAV : MVE_VMLSDAV_multi< "8", 0b0, 0b1>; defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"16", 0b0, 0b0>; defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"32", 0b1, 0b0>; // vmlav aliases vmladav foreach acc = ["", "a"] in { foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in { def : MVEInstAlias<"vmlav"#acc#"${vp}."#suffix#"\t$RdaDest, $Qn, $Qm", (!cast("MVE_VMLADAV"#acc#suffix) tGPREven:$RdaDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; } } // Base class for VMLALDAV and VMLSLDAV, VRMLALDAVH, VRMLSLDAVH class MVE_VMLALDAVBase pattern=[]> : MVE_rDest<(outs tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest), iops, NoItinerary, iname, suffix, "$RdaLoDest, $RdaHiDest, $Qn, $Qm", cstr, pattern> { bits<4> RdaLoDest; bits<4> RdaHiDest; bits<3> Qm; bits<3> Qn; let Inst{28} = bit_28; let Inst{22-20} = RdaHiDest{3-1}; let Inst{19-17} = Qn{2-0}; let Inst{16} = sz; let Inst{15-13} = RdaLoDest{3-1}; let Inst{12} = X; let Inst{8} = bit_8; let Inst{7-6} = 0b00; let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = bit_0; } multiclass MVE_VMLALDAVBase_A pattern=[]> { def ""#x#suffix : MVE_VMLALDAVBase< iname # x, suffix, (ins MQPR:$Qn, MQPR:$Qm), "", sz, bit_28, 0b0, X, bit_8, bit_0, pattern>; def "a"#x#suffix : MVE_VMLALDAVBase< iname # "a" # x, suffix, (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, MQPR:$Qn, MQPR:$Qm), "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc", sz, bit_28, 0b1, X, bit_8, bit_0, pattern>; } multiclass MVE_VMLALDAVBase_AX pattern=[]> { defm "" : MVE_VMLALDAVBase_A; defm "" : MVE_VMLALDAVBase_A; } multiclass MVE_VRMLALDAVH_multi pattern=[]> { defm "" : MVE_VMLALDAVBase_AX<"vrmlaldavh", "s"#suffix, 0b0, 0b0, 0b1, 0b0, pattern>; defm "" : MVE_VMLALDAVBase_A<"vrmlaldavh", "", "u"#suffix, 0b0, 0b1, 0b0, 0b1, 0b0, pattern>; } defm MVE_VRMLALDAVH : MVE_VRMLALDAVH_multi<"32">; // vrmlalvh aliases for vrmlaldavh def : MVEInstAlias<"vrmlalvh${vp}.s32\t$RdaLo, $RdaHi, 
$Qn, $Qm", (MVE_VRMLALDAVHs32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvha${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm", (MVE_VRMLALDAVHas32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvh${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm", (MVE_VRMLALDAVHu32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvha${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm", (MVE_VRMLALDAVHau32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; multiclass MVE_VMLALDAV_multi pattern=[]> { defm "" : MVE_VMLALDAVBase_AX<"vmlaldav", "s"#suffix, sz, 0b0, 0b0, 0b0, pattern>; defm "" : MVE_VMLALDAVBase_A<"vmlaldav", "", "u"#suffix, sz, 0b1, 0b0, 0b0, 0b0, pattern>; } defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>; defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>; // vmlalv aliases vmlaldav foreach acc = ["", "a"] in { foreach suffix = ["s16", "s32", "u16", "u32"] in { def : MVEInstAlias<"vmlalv" # acc # "${vp}." # suffix # "\t$RdaLoDest, $RdaHiDest, $Qn, $Qm", (!cast("MVE_VMLALDAV"#acc#suffix) tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; } } multiclass MVE_VMLSLDAV_multi pattern=[]> { defm "" : MVE_VMLALDAVBase_AX; } defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>; defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>; defm MVE_VRMLSLDAVH : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>; // end of mve_rDest instructions // start of mve_comp instructions class MVE_comp pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), itin, iname, suffix, "$Qd, $Qn, $Qm", vpred_r, cstr, pattern> { bits<4> Qd; bits<4> Qn; bits<4> Qm; let Inst{22} = Qd{3}; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{15-13} = Qd{2-0}; let Inst{12} = 0b0; let Inst{10-9} = 0b11; let Inst{7} = Qn{3}; let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; } class MVE_VMINMAXNM pattern=[]> : MVE_comp { let Inst{28} = 0b1; let Inst{25-24} = 0b11; let Inst{23} = 0b0; let Inst{21} = bit_21; let Inst{20} = sz; let Inst{11} = 0b1; let Inst{8} = 0b1; let Inst{6} = 0b1; let Inst{4} = 0b1; let Predicates = [HasMVEFloat]; } def MVE_VMAXNMf32 : MVE_VMINMAXNM<"vmaxnm", "f32", 0b0, 0b0>; def MVE_VMAXNMf16 : MVE_VMINMAXNM<"vmaxnm", "f16", 0b1, 0b0>; let Predicates = [HasMVEFloat] in { def : Pat<(v4f32 (fmaxnum (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; def : Pat<(v8f16 (fmaxnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; } def MVE_VMINNMf32 : MVE_VMINMAXNM<"vminnm", "f32", 0b0, 0b1>; def MVE_VMINNMf16 : MVE_VMINMAXNM<"vminnm", "f16", 0b1, 0b1>; let Predicates = [HasMVEFloat] in { def : Pat<(v4f32 (fminnum (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; def : Pat<(v8f16 (fminnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; } class MVE_VMINMAX size, bit bit_4, list pattern=[]> : MVE_comp { let Inst{28} = U; let Inst{25-24} = 0b11; let Inst{23} = 0b0; let Inst{21-20} = size{1-0}; let Inst{11} = 0b0; let Inst{8} = 0b0; let Inst{6} = 0b1; let Inst{4} = bit_4; } multiclass MVE_VMINMAX_all_sizes { def s8 : MVE_VMINMAX; def s16 : MVE_VMINMAX; def s32 : MVE_VMINMAX; def u8 : MVE_VMINMAX; def u16 : MVE_VMINMAX; def u32 : MVE_VMINMAX; } defm MVE_VMAX : MVE_VMINMAX_all_sizes<"vmax", 0b0>; defm MVE_VMIN : 
MVE_VMINMAX_all_sizes<"vmin", 0b1>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (smin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), (v16i8 (MVE_VMINs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(v8i16 (smin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), (v8i16 (MVE_VMINs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(v4i32 (smin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), (v4i32 (MVE_VMINs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; def : Pat<(v16i8 (smax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), (v16i8 (MVE_VMAXs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(v8i16 (smax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), (v8i16 (MVE_VMAXs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(v4i32 (smax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), (v4i32 (MVE_VMAXs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; def : Pat<(v16i8 (umin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), (v16i8 (MVE_VMINu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(v8i16 (umin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), (v8i16 (MVE_VMINu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(v4i32 (umin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), (v4i32 (MVE_VMINu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; def : Pat<(v16i8 (umax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), (v16i8 (MVE_VMAXu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(v8i16 (umax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), (v8i16 (MVE_VMAXu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(v4i32 (umax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), (v4i32 (MVE_VMAXu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; } // end of mve_comp instructions // start of mve_bit instructions class MVE_bit_arith pattern=[]> : MVE_p { bits<4> Qd; bits<4> Qm; let Inst{22} = Qd{3}; let Inst{15-13} = Qd{2-0}; let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; } def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), "vbic", "", "$Qd, $Qn, $Qm", ""> { bits<4> Qn; let Inst{28} = 0b0; let Inst{25-23} = 0b110; let Inst{21-20} = 0b01; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{12-8} = 0b00001; let Inst{7} = Qn{3}; let Inst{6} = 0b1; let Inst{4} = 0b1; let Inst{0} = 0b0; let validForTailPredication = 1; } class MVE_VREV size, bits<2> bit_8_7, string cstr=""> : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), iname, suffix, "$Qd, $Qm", cstr> { let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{21-20} = 0b11; let Inst{19-18} = size; let Inst{17-16} = 0b00; let Inst{12-9} = 0b0000; let Inst{8-7} = bit_8_7; let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; } def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00, "@earlyclobber $Qd">; def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00, "@earlyclobber $Qd">; def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00, "@earlyclobber $Qd">; def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01>; def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>; def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>; let Predicates = [HasMVEInt] in { def : Pat<(v8i16 (bswap (v8i16 MQPR:$src))), (v8i16 (MVE_VREV16_8 (v8i16 MQPR:$src)))>; def : Pat<(v4i32 (bswap (v4i32 MQPR:$src))), (v4i32 (MVE_VREV32_8 (v4i32 MQPR:$src)))>; } let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))), (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>; def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))), (v8i16 (MVE_VREV64_16 (v8i16 MQPR:$src)))>; def : Pat<(v16i8 (ARMvrev64 (v16i8 MQPR:$src))), (v16i8 (MVE_VREV64_8 (v16i8 MQPR:$src)))>; def : Pat<(v8i16 (ARMvrev32 (v8i16 MQPR:$src))), (v8i16 
(MVE_VREV32_16 (v8i16 MQPR:$src)))>; def : Pat<(v16i8 (ARMvrev32 (v16i8 MQPR:$src))), (v16i8 (MVE_VREV32_8 (v16i8 MQPR:$src)))>; def : Pat<(v16i8 (ARMvrev16 (v16i8 MQPR:$src))), (v16i8 (MVE_VREV16_8 (v16i8 MQPR:$src)))>; def : Pat<(v4f32 (ARMvrev64 (v4f32 MQPR:$src))), (v4f32 (MVE_VREV64_32 (v4f32 MQPR:$src)))>; def : Pat<(v8f16 (ARMvrev64 (v8f16 MQPR:$src))), (v8f16 (MVE_VREV64_16 (v8f16 MQPR:$src)))>; def : Pat<(v8f16 (ARMvrev32 (v8f16 MQPR:$src))), (v8f16 (MVE_VREV32_16 (v8f16 MQPR:$src)))>; } def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), "vmvn", "", "$Qd, $Qm", ""> { let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{21-16} = 0b110000; let Inst{12-6} = 0b0010111; let Inst{4} = 0b0; let Inst{0} = 0b0; let validForTailPredication = 1; } let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (vnotq (v16i8 MQPR:$val1))), (v16i8 (MVE_VMVN (v16i8 MQPR:$val1)))>; def : Pat<(v8i16 (vnotq (v8i16 MQPR:$val1))), (v8i16 (MVE_VMVN (v8i16 MQPR:$val1)))>; def : Pat<(v4i32 (vnotq (v4i32 MQPR:$val1))), (v4i32 (MVE_VMVN (v4i32 MQPR:$val1)))>; def : Pat<(v2i64 (vnotq (v2i64 MQPR:$val1))), (v2i64 (MVE_VMVN (v2i64 MQPR:$val1)))>; } class MVE_bit_ops bit_21_20, bit bit_28> : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), iname, "", "$Qd, $Qn, $Qm", ""> { bits<4> Qn; let Inst{28} = bit_28; let Inst{25-23} = 0b110; let Inst{21-20} = bit_21_20; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{12-8} = 0b00001; let Inst{7} = Qn{3}; let Inst{6} = 0b1; let Inst{4} = 0b1; let Inst{0} = 0b0; let validForTailPredication = 1; } def MVE_VEOR : MVE_bit_ops<"veor", 0b00, 0b1>; def MVE_VORN : MVE_bit_ops<"vorn", 0b11, 0b0>; def MVE_VORR : MVE_bit_ops<"vorr", 0b10, 0b0>; def MVE_VAND : MVE_bit_ops<"vand", 0b00, 0b0>; // add ignored suffixes as aliases foreach s=["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f32"] in { def : MVEInstAlias<"vbic${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc", (MVE_VBIC MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; def : MVEInstAlias<"veor${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc", (MVE_VEOR MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; def : MVEInstAlias<"vorn${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc", (MVE_VORN MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; def : MVEInstAlias<"vorr${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc", (MVE_VORR MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; def : MVEInstAlias<"vand${vp}." 
# s # "\t$QdSrc, $QnSrc, $QmSrc", (MVE_VAND MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; } let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), (v16i8 (MVE_VAND (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), (v8i16 (MVE_VAND (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(v4i32 (and (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), (v4i32 (MVE_VAND (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), (v2i64 (MVE_VAND (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), (v16i8 (MVE_VORR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), (v8i16 (MVE_VORR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), (v4i32 (MVE_VORR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), (v2i64 (MVE_VORR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; def : Pat<(v16i8 (xor (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), (v16i8 (MVE_VEOR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(v8i16 (xor (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), (v8i16 (MVE_VEOR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(v4i32 (xor (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), (v4i32 (MVE_VEOR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; def : Pat<(v2i64 (xor (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), (v2i64 (MVE_VEOR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (vnotq MQPR:$val2))), (v16i8 (MVE_VBIC (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (vnotq MQPR:$val2))), (v8i16 (MVE_VBIC (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(v4i32 (and (v4i32 MQPR:$val1), (vnotq MQPR:$val2))), (v4i32 (MVE_VBIC (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (vnotq MQPR:$val2))), (v2i64 (MVE_VBIC (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (vnotq MQPR:$val2))), (v16i8 (MVE_VORN (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (vnotq MQPR:$val2))), (v8i16 (MVE_VORN (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (vnotq MQPR:$val2))), (v4i32 (MVE_VORN (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (vnotq MQPR:$val2))), (v2i64 (MVE_VORN (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; } class MVE_bit_cmode cmode, dag inOps> : MVE_p<(outs MQPR:$Qd), inOps, NoItinerary, iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> { bits<8> imm; bits<4> Qd; let Inst{28} = imm{7}; let Inst{27-23} = 0b11111; let Inst{22} = Qd{3}; let Inst{21-19} = 0b000; let Inst{18-16} = imm{6-4}; let Inst{15-13} = Qd{2-0}; let Inst{12} = 0b0; let Inst{11-8} = cmode; let Inst{7-6} = 0b01; let Inst{4} = 0b1; let Inst{3-0} = imm{3-0}; } class MVE_VORR cmode, ExpandImm imm_type> : MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { let Inst{5} = 0b0; let validForTailPredication = 1; } def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>; def MVE_VORRIZ0v8i16 : MVE_VORR<"i16", 0b1001, expzero00>; def MVE_VORRIZ8v4i32 : MVE_VORR<"i32", 0b0011, expzero08>; def MVE_VORRIZ8v8i16 : MVE_VORR<"i16", 0b1011, expzero08>; def MVE_VORRIZ16v4i32 : MVE_VORR<"i32", 0b0101, expzero16>; def MVE_VORRIZ24v4i32 : MVE_VORR<"i32", 0b0111, 
expzero24>; def MVE_VORNIZ0v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", (ins MQPR:$Qd_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; def MVE_VORNIZ0v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm", (ins MQPR:$Qd_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; def MVE_VORNIZ8v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", (ins MQPR:$Qd_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; def MVE_VORNIZ8v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm", (ins MQPR:$Qd_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; def MVE_VORNIZ16v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", (ins MQPR:$Qd_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; def MVE_VORNIZ24v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", (ins MQPR:$Qd_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm", (MVE_VORR MQPR:$Qd, MQPR:$Qm, MQPR:$Qm, vpred_r:$vp)>; class MVE_VBIC cmode, ExpandImm imm_type> : MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { let Inst{5} = 0b1; let validForTailPredication = 1; } def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>; def MVE_VBICIZ0v8i16 : MVE_VBIC<"i16", 0b1001, expzero00>; def MVE_VBICIZ8v4i32 : MVE_VBIC<"i32", 0b0011, expzero08>; def MVE_VBICIZ8v8i16 : MVE_VBIC<"i16", 0b1011, expzero08>; def MVE_VBICIZ16v4i32 : MVE_VBIC<"i32", 0b0101, expzero16>; def MVE_VBICIZ24v4i32 : MVE_VBIC<"i32", 0b0111, expzero24>; def MVE_VANDIZ0v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", (ins MQPR:$Qda_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; def MVE_VANDIZ0v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm", (ins MQPR:$Qda_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; def MVE_VANDIZ8v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", (ins MQPR:$Qda_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; def MVE_VANDIZ8v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm", (ins MQPR:$Qda_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; def MVE_VANDIZ16v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", (ins MQPR:$Qda_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; def MVE_VANDIZ24v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", (ins MQPR:$Qda_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; class MVE_VMOV_lane_direction { bit bit_20; dag oops; dag iops; string ops; string cstr; } def MVE_VMOV_from_lane : MVE_VMOV_lane_direction { let bit_20 = 0b1; let oops = (outs rGPR:$Rt); let iops = (ins MQPR:$Qd); let ops = "$Rt, $Qd$Idx"; let cstr = ""; } def MVE_VMOV_to_lane : MVE_VMOV_lane_direction { let bit_20 = 0b0; let oops = (outs MQPR:$Qd); let iops = (ins MQPR:$Qd_src, rGPR:$Rt); let ops = "$Qd$Idx, $Rt"; let cstr = "$Qd = $Qd_src"; } class MVE_VMOV_lane : MVE_VMOV_lane_base { bits<4> Qd; bits<4> Rt; let Inst{31-24} = 0b11101110; let Inst{23} = U; let Inst{20} = dir.bit_20; let Inst{19-17} = Qd{2-0}; let Inst{15-12} = Rt{3-0}; let Inst{11-8} = 0b1011; let Inst{7} = Qd{3}; let Inst{4-0} = 0b10000; } class MVE_VMOV_lane_32 : MVE_VMOV_lane<"32", 0b0, (ins MVEVectorIndex<4>:$Idx), dir> { bits<2> Idx; let Inst{22} = 0b0; let Inst{6-5} = 0b00; let Inst{16} = Idx{1}; let Inst{21} = Idx{0}; let Predicates = [HasFPRegsV8_1M]; } class MVE_VMOV_lane_16 : MVE_VMOV_lane:$Idx), dir> { bits<3> Idx; let Inst{22} = 0b0; let Inst{5} = 0b1; let Inst{16} = Idx{2}; let Inst{21} = Idx{1}; let Inst{6} = Idx{0}; } class MVE_VMOV_lane_8 : MVE_VMOV_lane:$Idx), dir> { bits<4> Idx; let Inst{22} = 0b1; let Inst{16} = Idx{3}; let Inst{21} = Idx{2}; let 
Inst{6} = Idx{1}; let Inst{5} = Idx{0}; } def MVE_VMOV_from_lane_32 : MVE_VMOV_lane_32< MVE_VMOV_from_lane>; def MVE_VMOV_to_lane_32 : MVE_VMOV_lane_32< MVE_VMOV_to_lane>; def MVE_VMOV_from_lane_s16 : MVE_VMOV_lane_16<"s16", 0b0, MVE_VMOV_from_lane>; def MVE_VMOV_from_lane_u16 : MVE_VMOV_lane_16<"u16", 0b1, MVE_VMOV_from_lane>; def MVE_VMOV_to_lane_16 : MVE_VMOV_lane_16< "16", 0b0, MVE_VMOV_to_lane>; def MVE_VMOV_from_lane_s8 : MVE_VMOV_lane_8 < "s8", 0b0, MVE_VMOV_from_lane>; def MVE_VMOV_from_lane_u8 : MVE_VMOV_lane_8 < "u8", 0b1, MVE_VMOV_from_lane>; def MVE_VMOV_to_lane_8 : MVE_VMOV_lane_8 < "8", 0b0, MVE_VMOV_to_lane>; let Predicates = [HasMVEInt] in { def : Pat<(extractelt (v2f64 MQPR:$src), imm:$lane), (f64 (EXTRACT_SUBREG MQPR:$src, (DSubReg_f64_reg imm:$lane)))>; def : Pat<(insertelt (v2f64 MQPR:$src1), DPR:$src2, imm:$lane), (INSERT_SUBREG (v2f64 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), DPR:$src2, (DSubReg_f64_reg imm:$lane))>; def : Pat<(extractelt (v4i32 MQPR:$src), imm:$lane), (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), rGPR)>; def : Pat<(insertelt (v4i32 MQPR:$src1), rGPR:$src2, imm:$lane), (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$src2, imm:$lane)>; def : Pat<(vector_insert (v16i8 MQPR:$src1), rGPR:$src2, imm:$lane), (MVE_VMOV_to_lane_8 MQPR:$src1, rGPR:$src2, imm:$lane)>; def : Pat<(vector_insert (v8i16 MQPR:$src1), rGPR:$src2, imm:$lane), (MVE_VMOV_to_lane_16 MQPR:$src1, rGPR:$src2, imm:$lane)>; def : Pat<(ARMvgetlanes (v16i8 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>; def : Pat<(v16i8 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; def : Pat<(v8i16 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_16 (v8i16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; def : Pat<(v4i32 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_32 (v4i32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; // Floating point patterns, still enabled under HasMVEInt def : Pat<(extractelt (v4f32 MQPR:$src), imm:$lane), (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), SPR)>; def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane), (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), SPR:$src2, (SSubReg_f32_reg imm:$lane))>; def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane), (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>; def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane), (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>; def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane), (COPY_TO_REGCLASS (VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))), HPR)>; def : Pat<(v4f32 (scalar_to_vector SPR:$src)), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; def : Pat<(v4f32 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_32 (v4f32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; def : Pat<(v8f16 (scalar_to_vector HPR:$src)), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>; def : Pat<(v8f16 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; } // end of mve_bit instructions // start of MVE Integer instructions class MVE_int size, list pattern=[]> 
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, iname, suffix, "$Qd, $Qn, $Qm", vpred_r, "", pattern> { bits<4> Qd; bits<4> Qn; bits<4> Qm; let Inst{22} = Qd{3}; let Inst{21-20} = size; let Inst{19-17} = Qn{2-0}; let Inst{15-13} = Qd{2-0}; let Inst{7} = Qn{3}; let Inst{6} = 0b1; let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; } class MVE_VMULt1 size, list pattern=[]> : MVE_int<"vmul", suffix, size, pattern> { let Inst{28} = 0b0; let Inst{25-23} = 0b110; let Inst{16} = 0b0; let Inst{12-8} = 0b01001; let Inst{4} = 0b1; let Inst{0} = 0b0; } def MVE_VMULt1i8 : MVE_VMULt1<"i8", 0b00>; def MVE_VMULt1i16 : MVE_VMULt1<"i16", 0b01>; def MVE_VMULt1i32 : MVE_VMULt1<"i32", 0b10>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), (v16i8 (MVE_VMULt1i8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), (v8i16 (MVE_VMULt1i16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), (v4i32 (MVE_VMULt1i32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; } class MVE_VQxDMULH size, bit rounding, list pattern=[]> : MVE_int { let Inst{28} = rounding; let Inst{25-23} = 0b110; let Inst{16} = 0b0; let Inst{12-8} = 0b01011; let Inst{4} = 0b0; let Inst{0} = 0b0; } class MVE_VQDMULH size, list pattern=[]> : MVE_VQxDMULH<"vqdmulh", suffix, size, 0b0, pattern>; class MVE_VQRDMULH size, list pattern=[]> : MVE_VQxDMULH<"vqrdmulh", suffix, size, 0b1, pattern>; def MVE_VQDMULHi8 : MVE_VQDMULH<"s8", 0b00>; def MVE_VQDMULHi16 : MVE_VQDMULH<"s16", 0b01>; def MVE_VQDMULHi32 : MVE_VQDMULH<"s32", 0b10>; def MVE_VQRDMULHi8 : MVE_VQRDMULH<"s8", 0b00>; def MVE_VQRDMULHi16 : MVE_VQRDMULH<"s16", 0b01>; def MVE_VQRDMULHi32 : MVE_VQRDMULH<"s32", 0b10>; class MVE_VADDSUB size, bit subtract, list pattern=[]> : MVE_int { let Inst{28} = subtract; let Inst{25-23} = 0b110; let Inst{16} = 0b0; let Inst{12-8} = 0b01000; let Inst{4} = 0b0; let Inst{0} = 0b0; let validForTailPredication = 1; } multiclass MVE_VADDSUB_m { def "" : MVE_VADDSUB; let Predicates = [HasMVEInt] in { // Unpredicated add/subtract def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; // Predicated add/subtract def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 1), (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))>; } } multiclass MVE_VADD : MVE_VADDSUB_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>; multiclass MVE_VSUB : MVE_VADDSUB_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>; defm MVE_VADDi8 : MVE_VADD; defm MVE_VADDi16 : MVE_VADD; defm MVE_VADDi32 : MVE_VADD; defm MVE_VSUBi8 : MVE_VSUB; defm MVE_VSUBi16 : MVE_VSUB; defm MVE_VSUBi32 : MVE_VSUB; class MVE_VQADDSUB size, ValueType vt> : MVE_int { let Inst{28} = U; let Inst{25-23} = 0b110; let Inst{16} = 0b0; let Inst{12-10} = 0b000; let Inst{9} = subtract; let Inst{8} = 0b0; let Inst{4} = 0b1; let Inst{0} = 0b0; let validForTailPredication = 1; ValueType VT = vt; } class MVE_VQADD size, ValueType VT> : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, VT>; class MVE_VQSUB size, ValueType VT> : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, VT>; def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00, v16i8>; def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01, v8i16>; def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10, v4i32>; def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 
0b00, v16i8>; def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01, v8i16>; def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10, v4i32>; def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00, v16i8>; def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01, v8i16>; def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10, v4i32>; def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00, v16i8>; def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01, v8i16>; def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10, v4i32>; let Predicates = [HasMVEInt] in { foreach instr = [MVE_VQADDu8, MVE_VQADDu16, MVE_VQADDu32] in foreach VT = [instr.VT] in def : Pat<(VT (uaddsat (VT MQPR:$Qm), (VT MQPR:$Qn))), (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; foreach instr = [MVE_VQADDs8, MVE_VQADDs16, MVE_VQADDs32] in foreach VT = [instr.VT] in def : Pat<(VT (saddsat (VT MQPR:$Qm), (VT MQPR:$Qn))), (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; foreach instr = [MVE_VQSUBu8, MVE_VQSUBu16, MVE_VQSUBu32] in foreach VT = [instr.VT] in def : Pat<(VT (usubsat (VT MQPR:$Qm), (VT MQPR:$Qn))), (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; foreach instr = [MVE_VQSUBs8, MVE_VQSUBs16, MVE_VQSUBs32] in foreach VT = [instr.VT] in def : Pat<(VT (ssubsat (VT MQPR:$Qm), (VT MQPR:$Qn))), (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; } class MVE_VABD_int size, list pattern=[]> : MVE_int<"vabd", suffix, size, pattern> { let Inst{28} = U; let Inst{25-23} = 0b110; let Inst{16} = 0b0; let Inst{12-8} = 0b00111; let Inst{4} = 0b0; let Inst{0} = 0b0; let validForTailPredication = 1; } def MVE_VABDs8 : MVE_VABD_int<"s8", 0b0, 0b00>; def MVE_VABDs16 : MVE_VABD_int<"s16", 0b0, 0b01>; def MVE_VABDs32 : MVE_VABD_int<"s32", 0b0, 0b10>; def MVE_VABDu8 : MVE_VABD_int<"u8", 0b1, 0b00>; def MVE_VABDu16 : MVE_VABD_int<"u16", 0b1, 0b01>; def MVE_VABDu32 : MVE_VABD_int<"u32", 0b1, 0b10>; class MVE_VRHADD size, list pattern=[]> : MVE_int<"vrhadd", suffix, size, pattern> { let Inst{28} = U; let Inst{25-23} = 0b110; let Inst{16} = 0b0; let Inst{12-8} = 0b00001; let Inst{4} = 0b0; let Inst{0} = 0b0; let validForTailPredication = 1; } def MVE_VRHADDs8 : MVE_VRHADD<"s8", 0b0, 0b00>; def MVE_VRHADDs16 : MVE_VRHADD<"s16", 0b0, 0b01>; def MVE_VRHADDs32 : MVE_VRHADD<"s32", 0b0, 0b10>; def MVE_VRHADDu8 : MVE_VRHADD<"u8", 0b1, 0b00>; def MVE_VRHADDu16 : MVE_VRHADD<"u16", 0b1, 0b01>; def MVE_VRHADDu32 : MVE_VRHADD<"u32", 0b1, 0b10>; class MVE_VHADDSUB size, list pattern=[]> : MVE_int { let Inst{28} = U; let Inst{25-23} = 0b110; let Inst{16} = 0b0; let Inst{12-10} = 0b000; let Inst{9} = subtract; let Inst{8} = 0b0; let Inst{4} = 0b0; let Inst{0} = 0b0; let validForTailPredication = 1; } class MVE_VHADD size, list pattern=[]> : MVE_VHADDSUB<"vhadd", suffix, U, 0b0, size, pattern>; class MVE_VHSUB size, list pattern=[]> : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>; def MVE_VHADDs8 : MVE_VHADD<"s8", 0b0, 0b00>; def MVE_VHADDs16 : MVE_VHADD<"s16", 0b0, 0b01>; def MVE_VHADDs32 : MVE_VHADD<"s32", 0b0, 0b10>; def MVE_VHADDu8 : MVE_VHADD<"u8", 0b1, 0b00>; def MVE_VHADDu16 : MVE_VHADD<"u16", 0b1, 0b01>; def MVE_VHADDu32 : MVE_VHADD<"u32", 0b1, 0b10>; def MVE_VHSUBs8 : MVE_VHSUB<"s8", 0b0, 0b00>; def MVE_VHSUBs16 : MVE_VHSUB<"s16", 0b0, 0b01>; def MVE_VHSUBs32 : MVE_VHSUB<"s32", 0b0, 0b10>; def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>; def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>; def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (ARMvshrsImm (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), (v16i8 (MVE_VHADDs8 (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; def : Pat<(v8i16 (ARMvshrsImm (v8i16 
(add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), (v8i16 (MVE_VHADDs16 (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; def : Pat<(v4i32 (ARMvshrsImm (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), (v4i32 (MVE_VHADDs32 (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; def : Pat<(v16i8 (ARMvshruImm (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), (v16i8 (MVE_VHADDu8 (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; def : Pat<(v8i16 (ARMvshruImm (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), (v8i16 (MVE_VHADDu16 (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; def : Pat<(v4i32 (ARMvshruImm (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), (v4i32 (MVE_VHADDu32 (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; def : Pat<(v16i8 (ARMvshrsImm (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), (v16i8 (MVE_VHSUBs8 (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; def : Pat<(v8i16 (ARMvshrsImm (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), (v8i16 (MVE_VHSUBs16 (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; def : Pat<(v4i32 (ARMvshrsImm (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), (v4i32 (MVE_VHSUBs32 (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; def : Pat<(v16i8 (ARMvshruImm (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), (v16i8 (MVE_VHSUBu8 (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; def : Pat<(v8i16 (ARMvshruImm (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), (v8i16 (MVE_VHSUBu16 (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; def : Pat<(v4i32 (ARMvshruImm (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), (v4i32 (MVE_VHSUBu32 (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; } class MVE_VDUP pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary, "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> { bits<4> Qd; bits<4> Rt; let Inst{28} = 0b0; let Inst{25-23} = 0b101; let Inst{22} = B; let Inst{21-20} = 0b10; let Inst{19-17} = Qd{2-0}; let Inst{16} = 0b0; let Inst{15-12} = Rt; let Inst{11-8} = 0b1011; let Inst{7} = Qd{3}; let Inst{6} = 0b0; let Inst{5} = E; let Inst{4-0} = 0b10000; let validForTailPredication = 1; } def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0>; def MVE_VDUP16 : MVE_VDUP<"16", 0b0, 0b1>; def MVE_VDUP8 : MVE_VDUP<"8", 0b1, 0b0>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (ARMvdup (i32 rGPR:$elem))), (MVE_VDUP8 rGPR:$elem)>; def : Pat<(v8i16 (ARMvdup (i32 rGPR:$elem))), (MVE_VDUP16 rGPR:$elem)>; def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))), (MVE_VDUP32 rGPR:$elem)>; def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)), (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>; // For the 16-bit and 8-bit vduplanes we don't care about the signedness // of the lane move operation as we only want the lowest 8/16 bits anyway. 
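// As a minimal illustration of that point (the assembly is a sketch, not
// asserted compiler output): duplicating lane 3 of a v8i16 via the pattern
// below is expected to come out as something like
//   vmov.u16 r0, q1[3]   @ lane read out, zero-extended into the GPR
//   vdup.16  q0, r0      @ only the low 16 bits of r0 are broadcast
// so the unsigned lane move is just as good as the signed one here.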
def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)), (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>; def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)), (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>; def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))), (v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>; def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))), (v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>; def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)), (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>; def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)), (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>; } class MVEIntSingleSrc size, list pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm), NoItinerary, iname, suffix, "$Qd, $Qm", vpred_r, "", pattern> { bits<4> Qd; bits<4> Qm; let Inst{22} = Qd{3}; let Inst{19-18} = size{1-0}; let Inst{15-13} = Qd{2-0}; let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; } class MVE_VCLSCLZ size, bit count_zeroes, list pattern=[]> : MVEIntSingleSrc { let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{21-20} = 0b11; let Inst{17-16} = 0b00; let Inst{12-8} = 0b00100; let Inst{7} = count_zeroes; let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; let validForTailPredication = 1; } def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>; def MVE_VCLSs16 : MVE_VCLSCLZ<"vcls", "s16", 0b01, 0b0>; def MVE_VCLSs32 : MVE_VCLSCLZ<"vcls", "s32", 0b10, 0b0>; def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 0b00, 0b1>; def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>; def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 ( ctlz (v16i8 MQPR:$val1))), (v16i8 ( MVE_VCLZs8 (v16i8 MQPR:$val1)))>; def : Pat<(v4i32 ( ctlz (v4i32 MQPR:$val1))), (v4i32 ( MVE_VCLZs32 (v4i32 MQPR:$val1)))>; def : Pat<(v8i16 ( ctlz (v8i16 MQPR:$val1))), (v8i16 ( MVE_VCLZs16 (v8i16 MQPR:$val1)))>; } class MVE_VABSNEG_int size, bit negate, list pattern=[]> : MVEIntSingleSrc { let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{21-20} = 0b11; let Inst{17-16} = 0b01; let Inst{12-8} = 0b00011; let Inst{7} = negate; let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; let validForTailPredication = 1; } def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>; def MVE_VABSs16 : MVE_VABSNEG_int<"vabs", "s16", 0b01, 0b0>; def MVE_VABSs32 : MVE_VABSNEG_int<"vabs", "s32", 0b10, 0b0>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (abs (v16i8 MQPR:$v))), (v16i8 (MVE_VABSs8 $v))>; def : Pat<(v8i16 (abs (v8i16 MQPR:$v))), (v8i16 (MVE_VABSs16 $v))>; def : Pat<(v4i32 (abs (v4i32 MQPR:$v))), (v4i32 (MVE_VABSs32 $v))>; } def MVE_VNEGs8 : MVE_VABSNEG_int<"vneg", "s8", 0b00, 0b1>; def MVE_VNEGs16 : MVE_VABSNEG_int<"vneg", "s16", 0b01, 0b1>; def MVE_VNEGs32 : MVE_VABSNEG_int<"vneg", "s32", 0b10, 0b1>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (vnegq (v16i8 MQPR:$v))), (v16i8 (MVE_VNEGs8 $v))>; def : Pat<(v8i16 (vnegq (v8i16 MQPR:$v))), (v8i16 (MVE_VNEGs16 $v))>; def : Pat<(v4i32 (vnegq (v4i32 MQPR:$v))), (v4i32 (MVE_VNEGs32 $v))>; } class MVE_VQABSNEG size, bit negate, list pattern=[]> : MVEIntSingleSrc { let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{21-20} = 0b11; let Inst{17-16} = 0b00; let Inst{12-8} = 0b00111; let Inst{7} = negate; let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; let validForTailPredication = 1; } def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>; def MVE_VQABSs16 : MVE_VQABSNEG<"vqabs", "s16", 
0b01, 0b0>; def MVE_VQABSs32 : MVE_VQABSNEG<"vqabs", "s32", 0b10, 0b0>; def MVE_VQNEGs8 : MVE_VQABSNEG<"vqneg", "s8", 0b00, 0b1>; def MVE_VQNEGs16 : MVE_VQABSNEG<"vqneg", "s16", 0b01, 0b1>; def MVE_VQNEGs32 : MVE_VQABSNEG<"vqneg", "s32", 0b10, 0b1>; class MVE_mod_imm cmode, bit op, dag iops, list pattern=[]> : MVE_p<(outs MQPR:$Qd), iops, NoItinerary, iname, suffix, "$Qd, $imm", vpred_r, "", pattern> { bits<13> imm; bits<4> Qd; let Inst{28} = imm{7}; let Inst{25-23} = 0b111; let Inst{22} = Qd{3}; let Inst{21-19} = 0b000; let Inst{18-16} = imm{6-4}; let Inst{15-13} = Qd{2-0}; let Inst{12} = 0b0; let Inst{11-8} = cmode{3-0}; let Inst{7-6} = 0b01; let Inst{5} = op; let Inst{4} = 0b1; let Inst{3-0} = imm{3-0}; let DecoderMethod = "DecodeMVEModImmInstruction"; let validForTailPredication = 1; } let isReMaterializable = 1 in { let isAsCheapAsAMove = 1 in { def MVE_VMOVimmi8 : MVE_mod_imm<"vmov", "i8", {1,1,1,0}, 0b0, (ins nImmSplatI8:$imm)>; def MVE_VMOVimmi16 : MVE_mod_imm<"vmov", "i16", {1,0,?,0}, 0b0, (ins nImmSplatI16:$imm)> { let Inst{9} = imm{9}; } def MVE_VMOVimmi32 : MVE_mod_imm<"vmov", "i32", {?,?,?,?}, 0b0, (ins nImmVMOVI32:$imm)> { let Inst{11-8} = imm{11-8}; } def MVE_VMOVimmi64 : MVE_mod_imm<"vmov", "i64", {1,1,1,0}, 0b1, (ins nImmSplatI64:$imm)>; def MVE_VMOVimmf32 : MVE_mod_imm<"vmov", "f32", {1,1,1,1}, 0b0, (ins nImmVMOVF32:$imm)>; } // let isAsCheapAsAMove = 1 def MVE_VMVNimmi16 : MVE_mod_imm<"vmvn", "i16", {1,0,?,0}, 0b1, (ins nImmSplatI16:$imm)> { let Inst{9} = imm{9}; } def MVE_VMVNimmi32 : MVE_mod_imm<"vmvn", "i32", {?,?,?,?}, 0b1, (ins nImmVMOVI32:$imm)> { let Inst{11-8} = imm{11-8}; } } // let isReMaterializable = 1 let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (ARMvmovImm timm:$simm)), (v16i8 (MVE_VMOVimmi8 nImmSplatI8:$simm))>; def : Pat<(v8i16 (ARMvmovImm timm:$simm)), (v8i16 (MVE_VMOVimmi16 nImmSplatI16:$simm))>; def : Pat<(v4i32 (ARMvmovImm timm:$simm)), (v4i32 (MVE_VMOVimmi32 nImmVMOVI32:$simm))>; def : Pat<(v8i16 (ARMvmvnImm timm:$simm)), (v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm))>; def : Pat<(v4i32 (ARMvmvnImm timm:$simm)), (v4i32 (MVE_VMVNimmi32 nImmVMOVI32:$simm))>; def : Pat<(v4f32 (ARMvmovFPImm timm:$simm)), (v4f32 (MVE_VMOVimmf32 nImmVMOVF32:$simm))>; } class MVE_VMINMAXA size, bit bit_12, list pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm), NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src", pattern> { bits<4> Qd; bits<4> Qm; let Inst{28} = 0b0; let Inst{25-23} = 0b100; let Inst{22} = Qd{3}; let Inst{21-20} = 0b11; let Inst{19-18} = size; let Inst{17-16} = 0b11; let Inst{15-13} = Qd{2-0}; let Inst{12} = bit_12; let Inst{11-6} = 0b111010; let Inst{5} = Qm{3}; let Inst{4} = 0b0; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; } def MVE_VMAXAs8 : MVE_VMINMAXA<"vmaxa", "s8", 0b00, 0b0>; def MVE_VMAXAs16 : MVE_VMINMAXA<"vmaxa", "s16", 0b01, 0b0>; def MVE_VMAXAs32 : MVE_VMINMAXA<"vmaxa", "s32", 0b10, 0b0>; def MVE_VMINAs8 : MVE_VMINMAXA<"vmina", "s8", 0b00, 0b1>; def MVE_VMINAs16 : MVE_VMINMAXA<"vmina", "s16", 0b01, 0b1>; def MVE_VMINAs32 : MVE_VMINMAXA<"vmina", "s32", 0b10, 0b1>; // end of MVE Integer instructions // start of mve_imm_shift instructions def MVE_VSHLC : MVE_p<(outs rGPR:$RdmDest, MQPR:$Qd), (ins MQPR:$QdSrc, rGPR:$RdmSrc, long_shift:$imm), NoItinerary, "vshlc", "", "$QdSrc, $RdmSrc, $imm", vpred_n, "$RdmDest = $RdmSrc,$Qd = $QdSrc"> { bits<5> imm; bits<4> Qd; bits<4> RdmDest; let Inst{28} = 0b0; let Inst{25-23} = 0b101; let Inst{22} = Qd{3}; let Inst{21} = 0b1; let Inst{20-16} = imm{4-0}; let Inst{15-13} = Qd{2-0}; 
let Inst{12-4} = 0b011111100; let Inst{3-0} = RdmDest{3-0}; } class MVE_shift_imm pattern=[]> : MVE_p { bits<4> Qd; bits<4> Qm; let Inst{22} = Qd{3}; let Inst{15-13} = Qd{2-0}; let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; } class MVE_VMOVL sz, bit U, list pattern=[]> : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm), iname, suffix, "$Qd, $Qm", vpred_r, "", pattern> { let Inst{28} = U; let Inst{25-23} = 0b101; let Inst{21} = 0b1; let Inst{20-19} = sz{1-0}; let Inst{18-16} = 0b000; let Inst{11-6} = 0b111101; let Inst{4} = 0b0; let Inst{0} = 0b0; } multiclass MVE_VMOVL_shift_half sz, bit U, list pattern=[]> { def bh : MVE_VMOVL { let Inst{12} = 0b0; } def th : MVE_VMOVL { let Inst{12} = 0b1; } } defm MVE_VMOVLs8 : MVE_VMOVL_shift_half<"vmovl", "s8", 0b01, 0b0>; defm MVE_VMOVLu8 : MVE_VMOVL_shift_half<"vmovl", "u8", 0b01, 0b1>; defm MVE_VMOVLs16 : MVE_VMOVL_shift_half<"vmovl", "s16", 0b10, 0b0>; defm MVE_VMOVLu16 : MVE_VMOVL_shift_half<"vmovl", "u16", 0b10, 0b1>; let Predicates = [HasMVEInt] in { def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i16), (MVE_VMOVLs16bh MQPR:$src)>; def : Pat<(sext_inreg (v8i16 MQPR:$src), v8i8), (MVE_VMOVLs8bh MQPR:$src)>; def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i8), (MVE_VMOVLs16bh (MVE_VMOVLs8bh MQPR:$src))>; // zext_inreg 16 -> 32 def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))), (MVE_VMOVLu16bh MQPR:$src)>; // zext_inreg 8 -> 16 def : Pat<(and (v8i16 MQPR:$src), (v8i16 (ARMvmovImm (i32 0x8FF)))), (MVE_VMOVLu8bh MQPR:$src)>; } class MVE_VSHLL_imm pattern=[]> : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$Qm), immops), iname, suffix, "$Qd, $Qm, $imm", vpred_r, "", pattern> { let Inst{28} = U; let Inst{25-23} = 0b101; let Inst{21} = 0b1; let Inst{12} = th; let Inst{11-6} = 0b111101; let Inst{4} = 0b0; let Inst{0} = 0b0; } // The immediate VSHLL instructions accept shift counts from 1 up to // the lane width (8 or 16), but the full-width shifts have an // entirely separate encoding, given below with 'lw' in the name. 
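// For example (illustrative): "vshllb.s8 q0, q1, #3" fits the 3-bit immediate
// field of MVE_VSHLL_imm8 below (counts 1-7), whereas the full-width
// "vshllb.s8 q0, q1, #8" has no room in that field and is instead covered by
// the separate MVE_VSHLL_lws8bh encoding defined further down.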
class MVE_VSHLL_imm8 pattern=[]> : MVE_VSHLL_imm { bits<3> imm; let Inst{20-19} = 0b01; let Inst{18-16} = imm; } class MVE_VSHLL_imm16 pattern=[]> : MVE_VSHLL_imm { bits<4> imm; let Inst{20} = 0b1; let Inst{19-16} = imm; } def MVE_VSHLL_imms8bh : MVE_VSHLL_imm8 <"vshllb", "s8", 0b0, 0b0>; def MVE_VSHLL_imms8th : MVE_VSHLL_imm8 <"vshllt", "s8", 0b0, 0b1>; def MVE_VSHLL_immu8bh : MVE_VSHLL_imm8 <"vshllb", "u8", 0b1, 0b0>; def MVE_VSHLL_immu8th : MVE_VSHLL_imm8 <"vshllt", "u8", 0b1, 0b1>; def MVE_VSHLL_imms16bh : MVE_VSHLL_imm16<"vshllb", "s16", 0b0, 0b0>; def MVE_VSHLL_imms16th : MVE_VSHLL_imm16<"vshllt", "s16", 0b0, 0b1>; def MVE_VSHLL_immu16bh : MVE_VSHLL_imm16<"vshllb", "u16", 0b1, 0b0>; def MVE_VSHLL_immu16th : MVE_VSHLL_imm16<"vshllt", "u16", 0b1, 0b1>; class MVE_VSHLL_by_lane_width size, bit U, string ops, list pattern=[]> : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm), iname, suffix, ops, vpred_r, "", pattern> { let Inst{28} = U; let Inst{25-23} = 0b100; let Inst{21-20} = 0b11; let Inst{19-18} = size{1-0}; let Inst{17-16} = 0b01; let Inst{11-6} = 0b111000; let Inst{4} = 0b0; let Inst{0} = 0b1; } multiclass MVE_VSHLL_lw sz, bit U, string ops, list pattern=[]> { def bh : MVE_VSHLL_by_lane_width { let Inst{12} = 0b0; } def th : MVE_VSHLL_by_lane_width { let Inst{12} = 0b1; } } defm MVE_VSHLL_lws8 : MVE_VSHLL_lw<"vshll", "s8", 0b00, 0b0, "$Qd, $Qm, #8">; defm MVE_VSHLL_lws16 : MVE_VSHLL_lw<"vshll", "s16", 0b01, 0b0, "$Qd, $Qm, #16">; defm MVE_VSHLL_lwu8 : MVE_VSHLL_lw<"vshll", "u8", 0b00, 0b1, "$Qd, $Qm, #8">; defm MVE_VSHLL_lwu16 : MVE_VSHLL_lw<"vshll", "u16", 0b01, 0b1, "$Qd, $Qm, #16">; class MVE_VxSHRN pattern=[]> : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops), iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc", pattern> { bits<5> imm; let Inst{28} = bit_28; let Inst{25-23} = 0b101; let Inst{21} = 0b0; let Inst{20-16} = imm{4-0}; let Inst{12} = bit_12; let Inst{11-6} = 0b111111; let Inst{4} = 0b0; let Inst{0} = 0b1; } def MVE_VRSHRNi16bh : MVE_VxSHRN< "vrshrnb", "i16", 0b0, 0b1, (ins shr_imm8:$imm)> { let Inst{20-19} = 0b01; } def MVE_VRSHRNi16th : MVE_VxSHRN< "vrshrnt", "i16", 0b1, 0b1,(ins shr_imm8:$imm)> { let Inst{20-19} = 0b01; } def MVE_VRSHRNi32bh : MVE_VxSHRN< "vrshrnb", "i32", 0b0, 0b1, (ins shr_imm16:$imm)> { let Inst{20} = 0b1; } def MVE_VRSHRNi32th : MVE_VxSHRN< "vrshrnt", "i32", 0b1, 0b1, (ins shr_imm16:$imm)> { let Inst{20} = 0b1; } def MVE_VSHRNi16bh : MVE_VxSHRN< "vshrnb", "i16", 0b0, 0b0, (ins shr_imm8:$imm)> { let Inst{20-19} = 0b01; } def MVE_VSHRNi16th : MVE_VxSHRN< "vshrnt", "i16", 0b1, 0b0, (ins shr_imm8:$imm)> { let Inst{20-19} = 0b01; } def MVE_VSHRNi32bh : MVE_VxSHRN< "vshrnb", "i32", 0b0, 0b0, (ins shr_imm16:$imm)> { let Inst{20} = 0b1; } def MVE_VSHRNi32th : MVE_VxSHRN< "vshrnt", "i32", 0b1, 0b0, (ins shr_imm16:$imm)> { let Inst{20} = 0b1; } class MVE_VxQRSHRUN pattern=[]> : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops), iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc", pattern> { bits<5> imm; let Inst{28} = bit_28; let Inst{25-23} = 0b101; let Inst{21} = 0b0; let Inst{20-16} = imm{4-0}; let Inst{12} = bit_12; let Inst{11-6} = 0b111111; let Inst{4} = 0b0; let Inst{0} = 0b0; } def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN< "vqrshrunb", "s16", 0b1, 0b0, (ins shr_imm8:$imm)> { let Inst{20-19} = 0b01; } def MVE_VQRSHRUNs16th : MVE_VxQRSHRUN< "vqrshrunt", "s16", 0b1, 0b1, (ins shr_imm8:$imm)> { let Inst{20-19} = 0b01; } def MVE_VQRSHRUNs32bh : MVE_VxQRSHRUN< "vqrshrunb", "s32", 0b1, 0b0, (ins 
shr_imm16:$imm)> { let Inst{20} = 0b1; } def MVE_VQRSHRUNs32th : MVE_VxQRSHRUN< "vqrshrunt", "s32", 0b1, 0b1, (ins shr_imm16:$imm)> { let Inst{20} = 0b1; } def MVE_VQSHRUNs16bh : MVE_VxQRSHRUN< "vqshrunb", "s16", 0b0, 0b0, (ins shr_imm8:$imm)> { let Inst{20-19} = 0b01; } def MVE_VQSHRUNs16th : MVE_VxQRSHRUN< "vqshrunt", "s16", 0b0, 0b1, (ins shr_imm8:$imm)> { let Inst{20-19} = 0b01; } def MVE_VQSHRUNs32bh : MVE_VxQRSHRUN< "vqshrunb", "s32", 0b0, 0b0, (ins shr_imm16:$imm)> { let Inst{20} = 0b1; } def MVE_VQSHRUNs32th : MVE_VxQRSHRUN< "vqshrunt", "s32", 0b0, 0b1, (ins shr_imm16:$imm)> { let Inst{20} = 0b1; } class MVE_VxQRSHRN pattern=[]> : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops), iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc", pattern> { bits<5> imm; let Inst{25-23} = 0b101; let Inst{21} = 0b0; let Inst{20-16} = imm{4-0}; let Inst{12} = bit_12; let Inst{11-6} = 0b111101; let Inst{4} = 0b0; let Inst{0} = bit_0; } multiclass MVE_VxQRSHRN_types { def s16 : MVE_VxQRSHRN { let Inst{28} = 0b0; let Inst{20-19} = 0b01; } def u16 : MVE_VxQRSHRN { let Inst{28} = 0b1; let Inst{20-19} = 0b01; } def s32 : MVE_VxQRSHRN { let Inst{28} = 0b0; let Inst{20} = 0b1; } def u32 : MVE_VxQRSHRN { let Inst{28} = 0b1; let Inst{20} = 0b1; } } defm MVE_VQRSHRNbh : MVE_VxQRSHRN_types<"vqrshrnb", 0b1, 0b0>; defm MVE_VQRSHRNth : MVE_VxQRSHRN_types<"vqrshrnt", 0b1, 0b1>; defm MVE_VQSHRNbh : MVE_VxQRSHRN_types<"vqshrnb", 0b0, 0b0>; defm MVE_VQSHRNth : MVE_VxQRSHRN_types<"vqshrnt", 0b0, 0b1>; // end of mve_imm_shift instructions // start of mve_shift instructions class MVE_shift_by_vec size, bit bit_4, bit bit_8> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm, MQPR:$Qn), NoItinerary, iname, suffix, "$Qd, $Qm, $Qn", vpred_r, "", []> { // Shift instructions which take a vector of shift counts bits<4> Qd; bits<4> Qm; bits<4> Qn; let Inst{28} = U; let Inst{25-24} = 0b11; let Inst{23} = 0b0; let Inst{22} = Qd{3}; let Inst{21-20} = size; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{15-13} = Qd{2-0}; let Inst{12-9} = 0b0010; let Inst{8} = bit_8; let Inst{7} = Qn{3}; let Inst{6} = 0b1; let Inst{5} = Qm{3}; let Inst{4} = bit_4; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; let validForTailPredication = 1; } multiclass mve_shift_by_vec_multi { def s8 : MVE_shift_by_vec; def s16 : MVE_shift_by_vec; def s32 : MVE_shift_by_vec; def u8 : MVE_shift_by_vec; def u16 : MVE_shift_by_vec; def u32 : MVE_shift_by_vec; } defm MVE_VSHL_by_vec : mve_shift_by_vec_multi<"vshl", 0b0, 0b0>; defm MVE_VQSHL_by_vec : mve_shift_by_vec_multi<"vqshl", 0b1, 0b0>; defm MVE_VQRSHL_by_vec : mve_shift_by_vec_multi<"vqrshl", 0b1, 0b1>; defm MVE_VRSHL_by_vec : mve_shift_by_vec_multi<"vrshl", 0b0, 0b1>; let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn))), (v4i32 (MVE_VSHL_by_vecu32 (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)))>; def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn))), (v8i16 (MVE_VSHL_by_vecu16 (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)))>; def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn))), (v16i8 (MVE_VSHL_by_vecu8 (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)))>; def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn))), (v4i32 (MVE_VSHL_by_vecs32 (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)))>; def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn))), (v8i16 (MVE_VSHL_by_vecs16 (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)))>; def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn))), (v16i8 (MVE_VSHL_by_vecs8 (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)))>; } class 
MVE_shift_with_imm pattern=[]> : MVE_p { bits<4> Qd; bits<4> Qm; let Inst{23} = 0b1; let Inst{22} = Qd{3}; let Inst{15-13} = Qd{2-0}; let Inst{12-11} = 0b00; let Inst{7-6} = 0b01; let Inst{5} = Qm{3}; let Inst{4} = 0b1; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; let validForTailPredication = 1; } class MVE_VSxI_imm : MVE_shift_with_imm { bits<6> imm; let Inst{28} = 0b1; let Inst{25-24} = 0b11; let Inst{21-16} = imm; let Inst{10-9} = 0b10; let Inst{8} = bit_8; let validForTailPredication = 1; } def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, (ins shr_imm8:$imm)> { let Inst{21-19} = 0b001; } def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, (ins shr_imm16:$imm)> { let Inst{21-20} = 0b01; } def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, (ins shr_imm32:$imm)> { let Inst{21} = 0b1; } def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, (ins imm0_7:$imm)> { let Inst{21-19} = 0b001; } def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, (ins imm0_15:$imm)> { let Inst{21-20} = 0b01; } def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1,(ins imm0_31:$imm)> { let Inst{21} = 0b1; } class MVE_VQSHL_imm : MVE_shift_with_imm<"vqshl", suffix, (outs MQPR:$Qd), !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm", vpred_r, ""> { bits<6> imm; let Inst{25-24} = 0b11; let Inst{21-16} = imm; let Inst{10-8} = 0b111; } def MVE_VSLIimms8 : MVE_VQSHL_imm<"s8", (ins imm0_7:$imm)> { let Inst{28} = 0b0; let Inst{21-19} = 0b001; } def MVE_VSLIimmu8 : MVE_VQSHL_imm<"u8", (ins imm0_7:$imm)> { let Inst{28} = 0b1; let Inst{21-19} = 0b001; } def MVE_VSLIimms16 : MVE_VQSHL_imm<"s16", (ins imm0_15:$imm)> { let Inst{28} = 0b0; let Inst{21-20} = 0b01; } def MVE_VSLIimmu16 : MVE_VQSHL_imm<"u16", (ins imm0_15:$imm)> { let Inst{28} = 0b1; let Inst{21-20} = 0b01; } def MVE_VSLIimms32 : MVE_VQSHL_imm<"s32", (ins imm0_31:$imm)> { let Inst{28} = 0b0; let Inst{21} = 0b1; } def MVE_VSLIimmu32 : MVE_VQSHL_imm<"u32", (ins imm0_31:$imm)> { let Inst{28} = 0b1; let Inst{21} = 0b1; } class MVE_VQSHLU_imm : MVE_shift_with_imm<"vqshlu", suffix, (outs MQPR:$Qd), !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm", vpred_r, ""> { bits<6> imm; let Inst{28} = 0b1; let Inst{25-24} = 0b11; let Inst{21-16} = imm; let Inst{10-8} = 0b110; } def MVE_VQSHLU_imms8 : MVE_VQSHLU_imm<"s8", (ins imm0_7:$imm)> { let Inst{21-19} = 0b001; } def MVE_VQSHLU_imms16 : MVE_VQSHLU_imm<"s16", (ins imm0_15:$imm)> { let Inst{21-20} = 0b01; } def MVE_VQSHLU_imms32 : MVE_VQSHLU_imm<"s32", (ins imm0_31:$imm)> { let Inst{21} = 0b1; } class MVE_VRSHR_imm : MVE_shift_with_imm<"vrshr", suffix, (outs MQPR:$Qd), !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm", vpred_r, ""> { bits<6> imm; let Inst{25-24} = 0b11; let Inst{21-16} = imm; let Inst{10-8} = 0b010; } def MVE_VRSHR_imms8 : MVE_VRSHR_imm<"s8", (ins shr_imm8:$imm)> { let Inst{28} = 0b0; let Inst{21-19} = 0b001; } def MVE_VRSHR_immu8 : MVE_VRSHR_imm<"u8", (ins shr_imm8:$imm)> { let Inst{28} = 0b1; let Inst{21-19} = 0b001; } def MVE_VRSHR_imms16 : MVE_VRSHR_imm<"s16", (ins shr_imm16:$imm)> { let Inst{28} = 0b0; let Inst{21-20} = 0b01; } def MVE_VRSHR_immu16 : MVE_VRSHR_imm<"u16", (ins shr_imm16:$imm)> { let Inst{28} = 0b1; let Inst{21-20} = 0b01; } def MVE_VRSHR_imms32 : MVE_VRSHR_imm<"s32", (ins shr_imm32:$imm)> { let Inst{28} = 0b0; let Inst{21} = 0b1; } def MVE_VRSHR_immu32 : MVE_VRSHR_imm<"u32", (ins shr_imm32:$imm)> { let Inst{28} = 0b1; let Inst{21} = 0b1; } class MVE_VSHR_imm : MVE_shift_with_imm<"vshr", suffix, (outs MQPR:$Qd), !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm", vpred_r, ""> { bits<6> imm; let Inst{25-24} = 0b11; let 
Inst{21-16} = imm; let Inst{10-8} = 0b000; } def MVE_VSHR_imms8 : MVE_VSHR_imm<"s8", (ins shr_imm8:$imm)> { let Inst{28} = 0b0; let Inst{21-19} = 0b001; } def MVE_VSHR_immu8 : MVE_VSHR_imm<"u8", (ins shr_imm8:$imm)> { let Inst{28} = 0b1; let Inst{21-19} = 0b001; } def MVE_VSHR_imms16 : MVE_VSHR_imm<"s16", (ins shr_imm16:$imm)> { let Inst{28} = 0b0; let Inst{21-20} = 0b01; } def MVE_VSHR_immu16 : MVE_VSHR_imm<"u16", (ins shr_imm16:$imm)> { let Inst{28} = 0b1; let Inst{21-20} = 0b01; } def MVE_VSHR_imms32 : MVE_VSHR_imm<"s32", (ins shr_imm32:$imm)> { let Inst{28} = 0b0; let Inst{21} = 0b1; } def MVE_VSHR_immu32 : MVE_VSHR_imm<"u32", (ins shr_imm32:$imm)> { let Inst{28} = 0b1; let Inst{21} = 0b1; } class MVE_VSHL_imm : MVE_shift_with_imm<"vshl", suffix, (outs MQPR:$Qd), !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm", vpred_r, ""> { bits<6> imm; let Inst{28} = 0b0; let Inst{25-24} = 0b11; let Inst{21-16} = imm; let Inst{10-8} = 0b101; } def MVE_VSHL_immi8 : MVE_VSHL_imm<"i8", (ins imm0_7:$imm)> { let Inst{21-19} = 0b001; } def MVE_VSHL_immi16 : MVE_VSHL_imm<"i16", (ins imm0_15:$imm)> { let Inst{21-20} = 0b01; } def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm)> { let Inst{21} = 0b1; } let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (ARMvshlImm (v4i32 MQPR:$src), imm0_31:$imm)), (v4i32 (MVE_VSHL_immi32 (v4i32 MQPR:$src), imm0_31:$imm))>; def : Pat<(v8i16 (ARMvshlImm (v8i16 MQPR:$src), imm0_15:$imm)), (v8i16 (MVE_VSHL_immi16 (v8i16 MQPR:$src), imm0_15:$imm))>; def : Pat<(v16i8 (ARMvshlImm (v16i8 MQPR:$src), imm0_7:$imm)), (v16i8 (MVE_VSHL_immi8 (v16i8 MQPR:$src), imm0_7:$imm))>; def : Pat<(v4i32 (ARMvshruImm (v4i32 MQPR:$src), imm0_31:$imm)), (v4i32 (MVE_VSHR_immu32 (v4i32 MQPR:$src), imm0_31:$imm))>; def : Pat<(v8i16 (ARMvshruImm (v8i16 MQPR:$src), imm0_15:$imm)), (v8i16 (MVE_VSHR_immu16 (v8i16 MQPR:$src), imm0_15:$imm))>; def : Pat<(v16i8 (ARMvshruImm (v16i8 MQPR:$src), imm0_7:$imm)), (v16i8 (MVE_VSHR_immu8 (v16i8 MQPR:$src), imm0_7:$imm))>; def : Pat<(v4i32 (ARMvshrsImm (v4i32 MQPR:$src), imm0_31:$imm)), (v4i32 (MVE_VSHR_imms32 (v4i32 MQPR:$src), imm0_31:$imm))>; def : Pat<(v8i16 (ARMvshrsImm (v8i16 MQPR:$src), imm0_15:$imm)), (v8i16 (MVE_VSHR_imms16 (v8i16 MQPR:$src), imm0_15:$imm))>; def : Pat<(v16i8 (ARMvshrsImm (v16i8 MQPR:$src), imm0_7:$imm)), (v16i8 (MVE_VSHR_imms8 (v16i8 MQPR:$src), imm0_7:$imm))>; } // end of mve_shift instructions // start of MVE Floating Point instructions class MVE_float pattern=[]> : MVE_f { bits<4> Qm; let Inst{12} = 0b0; let Inst{6} = 0b1; let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; } class MVE_VRINT op, string suffix, bits<2> size, list pattern=[]> : MVE_float { bits<4> Qd; let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{22} = Qd{3}; let Inst{21-20} = 0b11; let Inst{19-18} = size; let Inst{17-16} = 0b10; let Inst{15-13} = Qd{2-0}; let Inst{11-10} = 0b01; let Inst{9-7} = op{2-0}; let Inst{4} = 0b0; let validForTailPredication = 1; } multiclass MVE_VRINT_ops size, list pattern=[]> { def N : MVE_VRINT<"n", 0b000, suffix, size, pattern>; def X : MVE_VRINT<"x", 0b001, suffix, size, pattern>; def A : MVE_VRINT<"a", 0b010, suffix, size, pattern>; def Z : MVE_VRINT<"z", 0b011, suffix, size, pattern>; def M : MVE_VRINT<"m", 0b101, suffix, size, pattern>; def P : MVE_VRINT<"p", 0b111, suffix, size, pattern>; } defm MVE_VRINTf16 : MVE_VRINT_ops<"f16", 0b01>; defm MVE_VRINTf32 : MVE_VRINT_ops<"f32", 0b10>; let Predicates = [HasMVEFloat] in { def : Pat<(v4f32 (frint (v4f32 MQPR:$val1))), (v4f32 (MVE_VRINTf32X (v4f32 MQPR:$val1)))>; def 
: Pat<(v8f16 (frint (v8f16 MQPR:$val1))), (v8f16 (MVE_VRINTf16X (v8f16 MQPR:$val1)))>; def : Pat<(v4f32 (fround (v4f32 MQPR:$val1))), (v4f32 (MVE_VRINTf32A (v4f32 MQPR:$val1)))>; def : Pat<(v8f16 (fround (v8f16 MQPR:$val1))), (v8f16 (MVE_VRINTf16A (v8f16 MQPR:$val1)))>; def : Pat<(v4f32 (ftrunc (v4f32 MQPR:$val1))), (v4f32 (MVE_VRINTf32Z (v4f32 MQPR:$val1)))>; def : Pat<(v8f16 (ftrunc (v8f16 MQPR:$val1))), (v8f16 (MVE_VRINTf16Z (v8f16 MQPR:$val1)))>; def : Pat<(v4f32 (ffloor (v4f32 MQPR:$val1))), (v4f32 (MVE_VRINTf32M (v4f32 MQPR:$val1)))>; def : Pat<(v8f16 (ffloor (v8f16 MQPR:$val1))), (v8f16 (MVE_VRINTf16M (v8f16 MQPR:$val1)))>; def : Pat<(v4f32 (fceil (v4f32 MQPR:$val1))), (v4f32 (MVE_VRINTf32P (v4f32 MQPR:$val1)))>; def : Pat<(v8f16 (fceil (v8f16 MQPR:$val1))), (v8f16 (MVE_VRINTf16P (v8f16 MQPR:$val1)))>; } class MVEFloatArithNeon pattern=[]> : MVE_float { let Inst{20} = size; let Inst{16} = 0b0; } class MVE_VMUL_fp pattern=[]> : MVEFloatArithNeon<"vmul", suffix, size, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", vpred_r, "", pattern> { bits<4> Qd; bits<4> Qn; let Inst{28} = 0b1; let Inst{25-23} = 0b110; let Inst{22} = Qd{3}; let Inst{21} = 0b0; let Inst{19-17} = Qn{2-0}; let Inst{15-13} = Qd{2-0}; let Inst{12-8} = 0b01101; let Inst{7} = Qn{3}; let Inst{4} = 0b1; let validForTailPredication = 1; } def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>; def MVE_VMULf16 : MVE_VMUL_fp<"f16", 0b1>; let Predicates = [HasMVEFloat] in { def : Pat<(v4f32 (fmul (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), (v4f32 (MVE_VMULf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; def : Pat<(v8f16 (fmul (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), (v8f16 (MVE_VMULf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; } class MVE_VCMLA pattern=[]> : MVEFloatArithNeon<"vcmla", suffix, size, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot), "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", pattern> { bits<4> Qd; bits<4> Qn; bits<2> rot; let Inst{28} = 0b1; let Inst{25} = 0b0; let Inst{24-23} = rot; let Inst{22} = Qd{3}; let Inst{21} = 0b1; let Inst{19-17} = Qn{2-0}; let Inst{15-13} = Qd{2-0}; let Inst{12-8} = 0b01000; let Inst{7} = Qn{3}; let Inst{4} = 0b0; } def MVE_VCMLAf16 : MVE_VCMLA<"f16", 0b0>; def MVE_VCMLAf32 : MVE_VCMLA<"f32", 0b1>; class MVE_VADDSUBFMA_fp pattern=[]> : MVEFloatArithNeon { bits<4> Qd; bits<4> Qn; let Inst{28} = 0b0; let Inst{25-23} = 0b110; let Inst{22} = Qd{3}; let Inst{21} = bit_21; let Inst{19-17} = Qn{2-0}; let Inst{15-13} = Qd{2-0}; let Inst{11-9} = 0b110; let Inst{8} = bit_8; let Inst{7} = Qn{3}; let Inst{4} = bit_4; } def MVE_VFMAf32 : MVE_VADDSUBFMA_fp<"vfma", "f32", 0b0, 0b1, 0b0, 0b0, (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; def MVE_VFMAf16 : MVE_VADDSUBFMA_fp<"vfma", "f16", 0b1, 0b1, 0b0, 0b0, (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1, (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1, (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; let Predicates = [HasMVEFloat, UseFusedMAC] in { def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1), (fmul (v8f16 MQPR:$src2), (v8f16 MQPR:$src3)))), (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>; def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1), (fmul (v4f32 MQPR:$src2), (v4f32 MQPR:$src3)))), (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>; def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1), (fmul (v8f16 MQPR:$src2), (v8f16 MQPR:$src3)))), (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>; def : Pat<(v4f32 (fsub (v4f32 
MQPR:$src1), (fmul (v4f32 MQPR:$src2), (v4f32 MQPR:$src3)))), (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>; } let Predicates = [HasMVEFloat] in { def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))), (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>; def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))), (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>; } multiclass MVE_VADDSUB_fp_m { def "" : MVE_VADDSUBFMA_fp { let validForTailPredication = 1; } let Predicates = [HasMVEFloat] in { def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), (VTI.Vec (!cast(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 1), (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))>; } } multiclass MVE_VADD_fp_m : MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated>; multiclass MVE_VSUB_fp_m : MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated>; defm MVE_VADDf32 : MVE_VADD_fp_m; defm MVE_VADDf16 : MVE_VADD_fp_m; defm MVE_VSUBf32 : MVE_VSUB_fp_m; defm MVE_VSUBf16 : MVE_VSUB_fp_m; class MVE_VCADD pattern=[]> : MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot), "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { bits<4> Qd; bits<4> Qn; bit rot; let Inst{28} = 0b1; let Inst{25} = 0b0; let Inst{24} = rot; let Inst{23} = 0b1; let Inst{22} = Qd{3}; let Inst{21} = 0b0; let Inst{19-17} = Qn{2-0}; let Inst{15-13} = Qd{2-0}; let Inst{12-8} = 0b01000; let Inst{7} = Qn{3}; let Inst{4} = 0b0; } def MVE_VCADDf16 : MVE_VCADD<"f16", 0b0>; def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1, "@earlyclobber $Qd">; class MVE_VABD_fp : MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", vpred_r, ""> { bits<4> Qd; bits<4> Qn; let Inst{28} = 0b1; let Inst{25-23} = 0b110; let Inst{22} = Qd{3}; let Inst{21} = 0b1; let Inst{20} = size; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{15-13} = Qd{2-0}; let Inst{11-8} = 0b1101; let Inst{7} = Qn{3}; let Inst{4} = 0b0; let validForTailPredication = 1; } def MVE_VABDf32 : MVE_VABD_fp<"f32", 0b0>; def MVE_VABDf16 : MVE_VABD_fp<"f16", 0b1>; class MVE_VCVT_fix pattern=[]> : MVE_float<"vcvt", suffix, (outs MQPR:$Qd), (ins MQPR:$Qm, imm_operand_type:$imm6), "$Qd, $Qm, $imm6", vpred_r, "", pattern> { bits<4> Qd; bits<6> imm6; let Inst{28} = U; let Inst{25-23} = 0b111; let Inst{22} = Qd{3}; let Inst{21} = 0b1; let Inst{19-16} = imm6{3-0}; let Inst{15-13} = Qd{2-0}; let Inst{11-10} = 0b11; let Inst{9} = fsi; let Inst{8} = op; let Inst{7} = 0b0; let Inst{4} = 0b1; let DecoderMethod = "DecodeMVEVCVTt1fp"; let validForTailPredication = 1; } class MVE_VCVT_imm_asmop : AsmOperandClass { let PredicateMethod = "isImmediate<1," # Bits # ">"; let DiagnosticString = "MVE fixed-point immediate operand must be between 1 and " # Bits; let Name = "MVEVcvtImm" # Bits; let RenderMethod = "addImmOperands"; } class MVE_VCVT_imm: Operand { let ParserMatchClass = MVE_VCVT_imm_asmop; let EncoderMethod = "getNEONVcvtImm32OpValue"; let DecoderMethod = "DecodeVCVTImmOperand"; } class MVE_VCVT_fix_f32 : MVE_VCVT_fix> { let Inst{20} = imm6{4}; } class MVE_VCVT_fix_f16 : MVE_VCVT_fix> { let Inst{20} = 0b1; } def MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16<"f16.s16", 0b0, 0b0>; def MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16<"s16.f16", 0b0, 0b1>; def MVE_VCVTf16u16_fix : 
MVE_VCVT_fix_f16<"f16.u16", 0b1, 0b0>; def MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16<"u16.f16", 0b1, 0b1>; def MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32<"f32.s32", 0b0, 0b0>; def MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32<"s32.f32", 0b0, 0b1>; def MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32<"f32.u32", 0b1, 0b0>; def MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32<"u32.f32", 0b1, 0b1>; class MVE_VCVT_fp_int_anpm size, bit op, string anpm, bits<2> rm, list pattern=[]> : MVE_float { bits<4> Qd; let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{22} = Qd{3}; let Inst{21-20} = 0b11; let Inst{19-18} = size; let Inst{17-16} = 0b11; let Inst{15-13} = Qd{2-0}; let Inst{12-10} = 0b000; let Inst{9-8} = rm; let Inst{7} = op; let Inst{4} = 0b0; let validForTailPredication = 1; } multiclass MVE_VCVT_fp_int_anpm_multi size, bit op, list pattern=[]> { def a : MVE_VCVT_fp_int_anpm; def n : MVE_VCVT_fp_int_anpm; def p : MVE_VCVT_fp_int_anpm; def m : MVE_VCVT_fp_int_anpm; } // This defines instructions such as MVE_VCVTu16f16a, with an explicit // rounding-mode suffix on the mnemonic. The class below will define // the bare MVE_VCVTu16f16 (with implied rounding toward zero). defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_multi<"s16.f16", 0b01, 0b0>; defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_multi<"u16.f16", 0b01, 0b1>; defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_multi<"s32.f32", 0b10, 0b0>; defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_multi<"u32.f32", 0b10, 0b1>; class MVE_VCVT_fp_int size, bits<2> op, list pattern=[]> : MVE_float<"vcvt", suffix, (outs MQPR:$Qd), (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> { bits<4> Qd; let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{22} = Qd{3}; let Inst{21-20} = 0b11; let Inst{19-18} = size; let Inst{17-16} = 0b11; let Inst{15-13} = Qd{2-0}; let Inst{12-9} = 0b0011; let Inst{8-7} = op; let Inst{4} = 0b0; let validForTailPredication = 1; } // The unsuffixed VCVT for float->int implicitly rounds toward zero, // which I reflect here in the llvm instruction names def MVE_VCVTs16f16z : MVE_VCVT_fp_int<"s16.f16", 0b01, 0b10>; def MVE_VCVTu16f16z : MVE_VCVT_fp_int<"u16.f16", 0b01, 0b11>; def MVE_VCVTs32f32z : MVE_VCVT_fp_int<"s32.f32", 0b10, 0b10>; def MVE_VCVTu32f32z : MVE_VCVT_fp_int<"u32.f32", 0b10, 0b11>; // Whereas VCVT for int->float rounds to nearest def MVE_VCVTf16s16n : MVE_VCVT_fp_int<"f16.s16", 0b01, 0b00>; def MVE_VCVTf16u16n : MVE_VCVT_fp_int<"f16.u16", 0b01, 0b01>; def MVE_VCVTf32s32n : MVE_VCVT_fp_int<"f32.s32", 0b10, 0b00>; def MVE_VCVTf32u32n : MVE_VCVT_fp_int<"f32.u32", 0b10, 0b01>; let Predicates = [HasMVEFloat] in { def : Pat<(v4i32 (fp_to_sint (v4f32 MQPR:$src))), (v4i32 (MVE_VCVTs32f32z (v4f32 MQPR:$src)))>; def : Pat<(v4i32 (fp_to_uint (v4f32 MQPR:$src))), (v4i32 (MVE_VCVTu32f32z (v4f32 MQPR:$src)))>; def : Pat<(v8i16 (fp_to_sint (v8f16 MQPR:$src))), (v8i16 (MVE_VCVTs16f16z (v8f16 MQPR:$src)))>; def : Pat<(v8i16 (fp_to_uint (v8f16 MQPR:$src))), (v8i16 (MVE_VCVTu16f16z (v8f16 MQPR:$src)))>; def : Pat<(v4f32 (sint_to_fp (v4i32 MQPR:$src))), (v4f32 (MVE_VCVTf32s32n (v4i32 MQPR:$src)))>; def : Pat<(v4f32 (uint_to_fp (v4i32 MQPR:$src))), (v4f32 (MVE_VCVTf32u32n (v4i32 MQPR:$src)))>; def : Pat<(v8f16 (sint_to_fp (v8i16 MQPR:$src))), (v8f16 (MVE_VCVTf16s16n (v8i16 MQPR:$src)))>; def : Pat<(v8f16 (uint_to_fp (v8i16 MQPR:$src))), (v8f16 (MVE_VCVTf16u16n (v8i16 MQPR:$src)))>; } class MVE_VABSNEG_fp size, bit negate, list pattern=[]> : MVE_float { bits<4> Qd; let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{22} = Qd{3}; let Inst{21-20} = 0b11; let Inst{19-18} = size; let Inst{17-16} 
= 0b01; let Inst{15-13} = Qd{2-0}; let Inst{11-8} = 0b0111; let Inst{7} = negate; let Inst{4} = 0b0; let validForTailPredication = 1; } def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>; def MVE_VABSf32 : MVE_VABSNEG_fp<"vabs", "f32", 0b10, 0b0>; let Predicates = [HasMVEFloat] in { def : Pat<(v8f16 (fabs MQPR:$src)), (MVE_VABSf16 MQPR:$src)>; def : Pat<(v4f32 (fabs MQPR:$src)), (MVE_VABSf32 MQPR:$src)>; } def MVE_VNEGf16 : MVE_VABSNEG_fp<"vneg", "f16", 0b01, 0b1>; def MVE_VNEGf32 : MVE_VABSNEG_fp<"vneg", "f32", 0b10, 0b1>; let Predicates = [HasMVEFloat] in { def : Pat<(v8f16 (fneg MQPR:$src)), (MVE_VNEGf16 MQPR:$src)>; def : Pat<(v4f32 (fneg MQPR:$src)), (MVE_VNEGf32 MQPR:$src)>; } class MVE_VMAXMINNMA pattern=[]> : MVE_f<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm), NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src", pattern> { bits<4> Qd; bits<4> Qm; let Inst{28} = size; let Inst{25-23} = 0b100; let Inst{22} = Qd{3}; let Inst{21-16} = 0b111111; let Inst{15-13} = Qd{2-0}; let Inst{12} = bit_12; let Inst{11-6} = 0b111010; let Inst{5} = Qm{3}; let Inst{4} = 0b0; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; } def MVE_VMAXNMAf32 : MVE_VMAXMINNMA<"vmaxnma", "f32", 0b0, 0b0>; def MVE_VMAXNMAf16 : MVE_VMAXMINNMA<"vmaxnma", "f16", 0b1, 0b0>; def MVE_VMINNMAf32 : MVE_VMAXMINNMA<"vminnma", "f32", 0b0, 0b1>; def MVE_VMINNMAf16 : MVE_VMAXMINNMA<"vminnma", "f16", 0b1, 0b1>; // end of MVE Floating Point instructions // start of MVE compares class MVE_VCMPqq bits_21_20, VCMPPredicateOperand predtype, list pattern=[]> : MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, MQPR:$Qm, predtype:$fc), NoItinerary, "vcmp", suffix, "$fc, $Qn, $Qm", vpred_n, "", pattern> { // Base class for comparing two vector registers bits<3> fc; bits<4> Qn; bits<4> Qm; let Inst{28} = bit_28; let Inst{25-22} = 0b1000; let Inst{21-20} = bits_21_20; let Inst{19-17} = Qn{2-0}; let Inst{16-13} = 0b1000; let Inst{12} = fc{2}; let Inst{11-8} = 0b1111; let Inst{7} = fc{0}; let Inst{6} = 0b0; let Inst{5} = Qm{3}; let Inst{4} = 0b0; let Inst{3-1} = Qm{2-0}; let Inst{0} = fc{1}; let Constraints = ""; // We need a custom decoder method for these instructions because of // the output VCCR operand, which isn't encoded in the instruction // bits anywhere (there is only one choice for it) but has to be // included in the MC operands so that codegen will be able to track // its data flow between instructions, spill/reload it when // necessary, etc. There seems to be no way to get the Tablegen // decoder to emit an operand that isn't affected by any instruction // bit. 
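// Concretely (an illustrative sketch): for an encoding of
// "vcmp.i32 eq, q1, q2" the decoder has to hand back the operand list
//   $P0 = VPR, $Qn = q1, $Qm = q2, $fc = eq
// even though nothing in Inst{} selects VPR (VCCR contains only that one
// register), hence the hand-written DecodeMVEVCMP hook set just below.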
let DecoderMethod = "DecodeMVEVCMP"; let validForTailPredication = 1; } class MVE_VCMPqqf : MVE_VCMPqq { let Predicates = [HasMVEFloat]; } class MVE_VCMPqqi size> : MVE_VCMPqq { let Inst{12} = 0b0; let Inst{0} = 0b0; } class MVE_VCMPqqu size> : MVE_VCMPqq { let Inst{12} = 0b0; let Inst{0} = 0b1; } class MVE_VCMPqqs size> : MVE_VCMPqq { let Inst{12} = 0b1; } def MVE_VCMPf32 : MVE_VCMPqqf<"f32", 0b0>; def MVE_VCMPf16 : MVE_VCMPqqf<"f16", 0b1>; def MVE_VCMPi8 : MVE_VCMPqqi<"i8", 0b00>; def MVE_VCMPi16 : MVE_VCMPqqi<"i16", 0b01>; def MVE_VCMPi32 : MVE_VCMPqqi<"i32", 0b10>; def MVE_VCMPu8 : MVE_VCMPqqu<"u8", 0b00>; def MVE_VCMPu16 : MVE_VCMPqqu<"u16", 0b01>; def MVE_VCMPu32 : MVE_VCMPqqu<"u32", 0b10>; def MVE_VCMPs8 : MVE_VCMPqqs<"s8", 0b00>; def MVE_VCMPs16 : MVE_VCMPqqs<"s16", 0b01>; def MVE_VCMPs32 : MVE_VCMPqqs<"s32", 0b10>; class MVE_VCMPqr bits_21_20, VCMPPredicateOperand predtype, list pattern=[]> : MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, GPRwithZR:$Rm, predtype:$fc), NoItinerary, "vcmp", suffix, "$fc, $Qn, $Rm", vpred_n, "", pattern> { // Base class for comparing a vector register with a scalar bits<3> fc; bits<4> Qn; bits<4> Rm; let Inst{28} = bit_28; let Inst{25-22} = 0b1000; let Inst{21-20} = bits_21_20; let Inst{19-17} = Qn{2-0}; let Inst{16-13} = 0b1000; let Inst{12} = fc{2}; let Inst{11-8} = 0b1111; let Inst{7} = fc{0}; let Inst{6} = 0b1; let Inst{5} = fc{1}; let Inst{4} = 0b0; let Inst{3-0} = Rm{3-0}; let Constraints = ""; // Custom decoder method, for the same reason as MVE_VCMPqq let DecoderMethod = "DecodeMVEVCMP"; let validForTailPredication = 1; } class MVE_VCMPqrf : MVE_VCMPqr { let Predicates = [HasMVEFloat]; } class MVE_VCMPqri size> : MVE_VCMPqr { let Inst{12} = 0b0; let Inst{5} = 0b0; } class MVE_VCMPqru size> : MVE_VCMPqr { let Inst{12} = 0b0; let Inst{5} = 0b1; } class MVE_VCMPqrs size> : MVE_VCMPqr { let Inst{12} = 0b1; } def MVE_VCMPf32r : MVE_VCMPqrf<"f32", 0b0>; def MVE_VCMPf16r : MVE_VCMPqrf<"f16", 0b1>; def MVE_VCMPi8r : MVE_VCMPqri<"i8", 0b00>; def MVE_VCMPi16r : MVE_VCMPqri<"i16", 0b01>; def MVE_VCMPi32r : MVE_VCMPqri<"i32", 0b10>; def MVE_VCMPu8r : MVE_VCMPqru<"u8", 0b00>; def MVE_VCMPu16r : MVE_VCMPqru<"u16", 0b01>; def MVE_VCMPu32r : MVE_VCMPqru<"u32", 0b10>; def MVE_VCMPs8r : MVE_VCMPqrs<"s8", 0b00>; def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>; def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>; multiclass unpred_vcmp_z { def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))), (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc))>; def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))), (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc))>; def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))), (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc))>; def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))))), (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))))), (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))))), (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; } multiclass unpred_vcmp_r { def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))), (v16i1 (!cast("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc))>; def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))), (v8i1 
(!cast("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc))>; def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))), (v4i1 (!cast("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>; def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))), (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>; def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))), (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>; def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))), (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>; def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))))), (v16i1 (!cast("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, 1, VCCR:$p1))>; def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))))), (v8i1 (!cast("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, 1, VCCR:$p1))>; def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))))), (v4i1 (!cast("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, 1, VCCR:$p1))>; def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))))), (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))))), (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))))), (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; } multiclass unpred_vcmpf_z { def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))), (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc))>; def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>; def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))))), (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))))), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; } multiclass unpred_vcmpf_r { def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))), (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))), (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))), (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>; def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>; def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))))), (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, 1, VCCR:$p1))>; def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))))), (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, 1, VCCR:$p1))>; def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 
MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))))), (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))))), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; } let Predicates = [HasMVEInt] in { defm MVE_VCEQZ : unpred_vcmp_z<"i", 0>; defm MVE_VCNEZ : unpred_vcmp_z<"i", 1>; defm MVE_VCGEZ : unpred_vcmp_z<"s", 10>; defm MVE_VCLTZ : unpred_vcmp_z<"s", 11>; defm MVE_VCGTZ : unpred_vcmp_z<"s", 12>; defm MVE_VCLEZ : unpred_vcmp_z<"s", 13>; defm MVE_VCGTUZ : unpred_vcmp_z<"u", 8>; defm MVE_VCGEUZ : unpred_vcmp_z<"u", 2>; defm MVE_VCEQ : unpred_vcmp_r<"i", 0>; defm MVE_VCNE : unpred_vcmp_r<"i", 1>; defm MVE_VCGE : unpred_vcmp_r<"s", 10>; defm MVE_VCLT : unpred_vcmp_r<"s", 11>; defm MVE_VCGT : unpred_vcmp_r<"s", 12>; defm MVE_VCLE : unpred_vcmp_r<"s", 13>; defm MVE_VCGTU : unpred_vcmp_r<"u", 8>; defm MVE_VCGEU : unpred_vcmp_r<"u", 2>; } let Predicates = [HasMVEFloat] in { defm MVE_VFCEQZ : unpred_vcmpf_z<0>; defm MVE_VFCNEZ : unpred_vcmpf_z<1>; defm MVE_VFCGEZ : unpred_vcmpf_z<10>; defm MVE_VFCLTZ : unpred_vcmpf_z<11>; defm MVE_VFCGTZ : unpred_vcmpf_z<12>; defm MVE_VFCLEZ : unpred_vcmpf_z<13>; defm MVE_VFCEQ : unpred_vcmpf_r<0>; defm MVE_VFCNE : unpred_vcmpf_r<1>; defm MVE_VFCGE : unpred_vcmpf_r<10>; defm MVE_VFCLT : unpred_vcmpf_r<11>; defm MVE_VFCGT : unpred_vcmpf_r<12>; defm MVE_VFCLE : unpred_vcmpf_r<13>; } // Extra "worst case" and/or/xor partterns, going into and out of GRP multiclass two_predops { def v16i1 : Pat<(v16i1 (opnode (v16i1 VCCR:$p1), (v16i1 VCCR:$p2))), (v16i1 (COPY_TO_REGCLASS (insn (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p1), rGPR)), (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p2), rGPR))), VCCR))>; def v8i1 : Pat<(v8i1 (opnode (v8i1 VCCR:$p1), (v8i1 VCCR:$p2))), (v8i1 (COPY_TO_REGCLASS (insn (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p1), rGPR)), (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p2), rGPR))), VCCR))>; def v4i1 : Pat<(v4i1 (opnode (v4i1 VCCR:$p1), (v4i1 VCCR:$p2))), (v4i1 (COPY_TO_REGCLASS (insn (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p1), rGPR)), (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p2), rGPR))), VCCR))>; } let Predicates = [HasMVEInt] in { defm POR : two_predops; defm PAND : two_predops; defm PEOR : two_predops; } // Occasionally we need to cast between a i32 and a boolean vector, for // example when moving between rGPR and VPR.P0 as part of predicate vector // shuffles. We also sometimes need to cast between different predicate // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles. 
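// A minimal sketch of one instantiation of the foreach below (written out by
// hand, illustrative only): with VT = v4i1 and VT2 = v8i1 it produces
//   def : Pat<(v4i1 (predicate_cast (v8i1 VCCR:$src))),
//             (v4i1 (COPY_TO_REGCLASS (v8i1 VCCR:$src), VCCR))>;
// i.e. each such cast is just a register-class copy of VPR.P0, which the
// register coalescer can usually fold away entirely.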
def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>; let Predicates = [HasMVEInt] in { foreach VT = [ v4i1, v8i1, v16i1 ] in { def : Pat<(i32 (predicate_cast (VT VCCR:$src))), (i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>; def : Pat<(VT (predicate_cast (i32 VCCR:$src))), (VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>; foreach VT2 = [ v4i1, v8i1, v16i1 ] in def : Pat<(VT (predicate_cast (VT2 VCCR:$src))), (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; } } // end of MVE compares // start of MVE_qDest_qSrc class MVE_qDest_qSrc pattern=[]> : MVE_p { bits<4> Qd; bits<4> Qm; let Inst{25-23} = 0b100; let Inst{22} = Qd{3}; let Inst{15-13} = Qd{2-0}; let Inst{11-9} = 0b111; let Inst{6} = 0b0; let Inst{5} = Qm{3}; let Inst{4} = 0b0; let Inst{3-1} = Qm{2-0}; } class MVE_VQxDMLxDH size, string cstr="", list pattern=[]> : MVE_qDest_qSrc { bits<4> Qn; let Inst{28} = subtract; let Inst{21-20} = size; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{12} = exch; let Inst{8} = 0b0; let Inst{7} = Qn{3}; let Inst{0} = round; } multiclass MVE_VQxDMLxDH_multi { def s8 : MVE_VQxDMLxDH; def s16 : MVE_VQxDMLxDH; def s32 : MVE_VQxDMLxDH; } defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>; defm MVE_VQDMLADHX : MVE_VQxDMLxDH_multi<"vqdmladhx", 0b1, 0b0, 0b0>; defm MVE_VQRDMLADH : MVE_VQxDMLxDH_multi<"vqrdmladh", 0b0, 0b1, 0b0>; defm MVE_VQRDMLADHX : MVE_VQxDMLxDH_multi<"vqrdmladhx", 0b1, 0b1, 0b0>; defm MVE_VQDMLSDH : MVE_VQxDMLxDH_multi<"vqdmlsdh", 0b0, 0b0, 0b1>; defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>; defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>; defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>; class MVE_VCMUL pattern=[]> : MVE_qDest_qSrc { bits<4> Qn; bits<2> rot; let Inst{28} = size; let Inst{21-20} = 0b11; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{12} = rot{1}; let Inst{8} = 0b0; let Inst{7} = Qn{3}; let Inst{0} = rot{0}; let Predicates = [HasMVEFloat]; } def MVE_VCMULf16 : MVE_VCMUL<"vcmul", "f16", 0b0>; def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1, "@earlyclobber $Qd">; class MVE_VMULL bits_21_20, bit T, string cstr, list pattern=[]> : MVE_qDest_qSrc { bits<4> Qd; bits<4> Qn; bits<4> Qm; let Inst{28} = bit_28; let Inst{21-20} = bits_21_20; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b1; let Inst{12} = T; let Inst{8} = 0b0; let Inst{7} = Qn{3}; let Inst{0} = 0b0; } multiclass MVE_VMULL_multi bits_21_20, string cstr=""> { def bh : MVE_VMULL; def th : MVE_VMULL; } // For integer multiplies, bits 21:20 encode size, and bit 28 signedness. // For polynomial multiplies, bits 21:20 take the unused value 0b11, and // bit 28 switches to encoding the size. 
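// Spelled out against the defm parameters below (illustrative summary):
//   vmull.s16 -> bit 28 = 0 (signed),   bits 21:20 = 0b01 (16-bit elements)
//   vmull.u16 -> bit 28 = 1 (unsigned), bits 21:20 = 0b01 (16-bit elements)
//   vmull.p8  -> bits 21:20 = 0b11,     bit 28 = 0 (8-bit polynomial)
//   vmull.p16 -> bits 21:20 = 0b11,     bit 28 = 1 (16-bit polynomial)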
defm MVE_VMULLs8 : MVE_VMULL_multi<"vmull", "s8", 0b0, 0b00>; defm MVE_VMULLs16 : MVE_VMULL_multi<"vmull", "s16", 0b0, 0b01>; defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10, "@earlyclobber $Qd">; defm MVE_VMULLu8 : MVE_VMULL_multi<"vmull", "u8", 0b1, 0b00>; defm MVE_VMULLu16 : MVE_VMULL_multi<"vmull", "u16", 0b1, 0b01>; defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10, "@earlyclobber $Qd">; defm MVE_VMULLp8 : MVE_VMULL_multi<"vmull", "p8", 0b0, 0b11>; defm MVE_VMULLp16 : MVE_VMULL_multi<"vmull", "p16", 0b1, 0b11>; class MVE_VxMULH size, bit round, list pattern=[]> : MVE_qDest_qSrc { bits<4> Qn; let Inst{28} = U; let Inst{21-20} = size; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b1; let Inst{12} = round; let Inst{8} = 0b0; let Inst{7} = Qn{3}; let Inst{0} = 0b1; } def MVE_VMULHs8 : MVE_VxMULH<"vmulh", "s8", 0b0, 0b00, 0b0>; def MVE_VMULHs16 : MVE_VxMULH<"vmulh", "s16", 0b0, 0b01, 0b0>; def MVE_VMULHs32 : MVE_VxMULH<"vmulh", "s32", 0b0, 0b10, 0b0>; def MVE_VMULHu8 : MVE_VxMULH<"vmulh", "u8", 0b1, 0b00, 0b0>; def MVE_VMULHu16 : MVE_VxMULH<"vmulh", "u16", 0b1, 0b01, 0b0>; def MVE_VMULHu32 : MVE_VxMULH<"vmulh", "u32", 0b1, 0b10, 0b0>; def MVE_VRMULHs8 : MVE_VxMULH<"vrmulh", "s8", 0b0, 0b00, 0b1>; def MVE_VRMULHs16 : MVE_VxMULH<"vrmulh", "s16", 0b0, 0b01, 0b1>; def MVE_VRMULHs32 : MVE_VxMULH<"vrmulh", "s32", 0b0, 0b10, 0b1>; def MVE_VRMULHu8 : MVE_VxMULH<"vrmulh", "u8", 0b1, 0b00, 0b1>; def MVE_VRMULHu16 : MVE_VxMULH<"vrmulh", "u16", 0b1, 0b01, 0b1>; def MVE_VRMULHu32 : MVE_VxMULH<"vrmulh", "u32", 0b1, 0b10, 0b1>; class MVE_VxMOVxN size, bit T, list pattern=[]> : MVE_qDest_qSrc { let Inst{28} = bit_28; let Inst{21-20} = 0b11; let Inst{19-18} = size; let Inst{17} = bit_17; let Inst{16} = 0b1; let Inst{12} = T; let Inst{8} = 0b0; let Inst{7} = !if(!eq(bit_17, 0), 1, 0); let Inst{0} = 0b1; } multiclass MVE_VxMOVxN_halves size> { def bh : MVE_VxMOVxN; def th : MVE_VxMOVxN; } defm MVE_VMOVNi16 : MVE_VxMOVxN_halves<"vmovn", "i16", 0b1, 0b0, 0b00>; defm MVE_VMOVNi32 : MVE_VxMOVxN_halves<"vmovn", "i32", 0b1, 0b0, 0b01>; defm MVE_VQMOVNs16 : MVE_VxMOVxN_halves<"vqmovn", "s16", 0b0, 0b1, 0b00>; defm MVE_VQMOVNs32 : MVE_VxMOVxN_halves<"vqmovn", "s32", 0b0, 0b1, 0b01>; defm MVE_VQMOVNu16 : MVE_VxMOVxN_halves<"vqmovn", "u16", 0b1, 0b1, 0b00>; defm MVE_VQMOVNu32 : MVE_VxMOVxN_halves<"vqmovn", "u32", 0b1, 0b1, 0b01>; defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>; defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>; def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>; let Predicates = [HasMVEInt] in { def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))), (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))), (v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))), (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))), (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; } class MVE_VCVT_ff pattern=[]> : MVE_qDest_qSrc { let Inst{28} = op; let Inst{21-16} = 0b111111; let Inst{12} = T; let Inst{8-7} = 0b00; let Inst{0} = 0b1; let Predicates = [HasMVEFloat]; } multiclass MVE_VCVT_f2h_m { def "": MVE_VCVT_ff; let Predicates = [HasMVEFloat] in { def : Pat<(v8f16 (int_arm_mve_vcvt_narrow (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))), 
(v8f16 (!cast(NAME) (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>; def : Pat<(v8f16 (int_arm_mve_vcvt_narrow_predicated (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half), (v4i1 VCCR:$mask))), (v8f16 (!cast(NAME) (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 1), (v4i1 VCCR:$mask)))>; } } multiclass MVE_VCVT_h2f_m { def "": MVE_VCVT_ff; } defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>; defm MVE_VCVTf16f32th : MVE_VCVT_f2h_m<"vcvtt", 0b1>; defm MVE_VCVTf32f16bh : MVE_VCVT_h2f_m<"vcvtb", 0b0>; defm MVE_VCVTf32f16th : MVE_VCVT_h2f_m<"vcvtt", 0b1>; class MVE_VxCADD size, bit halve, string cstr="", list pattern=[]> : MVE_qDest_qSrc { bits<4> Qn; bit rot; let Inst{28} = halve; let Inst{21-20} = size; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{12} = rot; let Inst{8} = 0b1; let Inst{7} = Qn{3}; let Inst{0} = 0b0; } def MVE_VCADDi8 : MVE_VxCADD<"vcadd", "i8", 0b00, 0b1>; def MVE_VCADDi16 : MVE_VxCADD<"vcadd", "i16", 0b01, 0b1>; def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1, "@earlyclobber $Qd">; def MVE_VHCADDs8 : MVE_VxCADD<"vhcadd", "s8", 0b00, 0b0>; def MVE_VHCADDs16 : MVE_VxCADD<"vhcadd", "s16", 0b01, 0b0>; def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0, "@earlyclobber $Qd">; class MVE_VADCSBC pattern=[]> : MVE_qDest_qSrc { bits<4> Qn; let Inst{28} = subtract; let Inst{21-20} = 0b11; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{12} = I; let Inst{8} = 0b1; let Inst{7} = Qn{3}; let Inst{0} = 0b0; // Custom decoder method in order to add the FPSCR operand(s), which // Tablegen won't do right let DecoderMethod = "DecodeMVEVADCInstruction"; } def MVE_VADC : MVE_VADCSBC<"vadc", 0b0, 0b0, (ins cl_FPSCR_NZCV:$carryin)>; def MVE_VADCI : MVE_VADCSBC<"vadci", 0b1, 0b0, (ins)>; def MVE_VSBC : MVE_VADCSBC<"vsbc", 0b0, 0b1, (ins cl_FPSCR_NZCV:$carryin)>; def MVE_VSBCI : MVE_VADCSBC<"vsbci", 0b1, 0b1, (ins)>; class MVE_VQDMULL pattern=[]> : MVE_qDest_qSrc { bits<4> Qn; let Inst{28} = size; let Inst{21-20} = 0b11; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{12} = T; let Inst{8} = 0b1; let Inst{7} = Qn{3}; let Inst{0} = 0b1; } multiclass MVE_VQDMULL_halves { def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>; def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>; } defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>; defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">; // end of mve_qDest_qSrc // start of mve_qDest_rSrc class MVE_qr_base pattern=[]> : MVE_p { bits<4> Qd; bits<4> Qn; bits<4> Rm; let Inst{25-23} = 0b100; let Inst{22} = Qd{3}; let Inst{19-17} = Qn{2-0}; let Inst{15-13} = Qd{2-0}; let Inst{11-9} = 0b111; let Inst{7} = Qn{3}; let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{3-0} = Rm{3-0}; } class MVE_qDest_rSrc pattern=[]> : MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qn, rGPR:$Rm), NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, cstr, pattern>; class MVE_qDestSrc_rSrc pattern=[]> : MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, rGPR:$Rm), NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_n, "$Qd = $Qd_src", pattern>; class MVE_qDest_single_rSrc pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, rGPR:$Rm), NoItinerary, iname, suffix, "$Qd, $Rm", vpred_n, "$Qd = $Qd_src", pattern> { bits<4> Qd; bits<4> Rm; let Inst{22} = Qd{3}; let Inst{15-13} = Qd{2-0}; let Inst{3-0} = Rm{3-0}; } class MVE_VADDSUB_qr size, bit bit_5, bit bit_12, bit bit_16, bit bit_28, list pattern=[]> : MVE_qDest_rSrc { let Inst{28} = bit_28; let Inst{21-20} = size; let Inst{16} = bit_16; let Inst{12} = bit_12; let Inst{8} = 
0b1; let Inst{5} = bit_5; let validForTailPredication = 1; } multiclass MVE_VADDSUB_qr_sizes pattern=[]> { def "8" : MVE_VADDSUB_qr; def "16" : MVE_VADDSUB_qr; def "32" : MVE_VADDSUB_qr; } defm MVE_VADD_qr_i : MVE_VADDSUB_qr_sizes<"vadd", "i", 0b0, 0b0, 0b1, 0b0>; defm MVE_VQADD_qr_s : MVE_VADDSUB_qr_sizes<"vqadd", "s", 0b1, 0b0, 0b0, 0b0>; defm MVE_VQADD_qr_u : MVE_VADDSUB_qr_sizes<"vqadd", "u", 0b1, 0b0, 0b0, 0b1>; defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>; defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>; defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; } let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; } class MVE_VQDMULL_qr pattern=[]> : MVE_qDest_rSrc { let Inst{28} = size; let Inst{21-20} = 0b11; let Inst{16} = 0b0; let Inst{12} = T; let Inst{8} = 0b1; let Inst{5} = 0b1; } multiclass MVE_VQDMULL_qr_halves { def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>; def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>; } defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>; defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">; class MVE_VxADDSUB_qr bits_21_20, bit subtract, list pattern=[]> : MVE_qDest_rSrc { let Inst{28} = bit_28; let Inst{21-20} = bits_21_20; let Inst{16} = 0b0; let Inst{12} = subtract; let Inst{8} = 0b1; let Inst{5} = 0b0; let validForTailPredication = 1; } def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>; def MVE_VHADD_qr_s16 : MVE_VxADDSUB_qr<"vhadd", "s16", 0b0, 0b01, 0b0>; def MVE_VHADD_qr_s32 : MVE_VxADDSUB_qr<"vhadd", "s32", 0b0, 0b10, 0b0>; def MVE_VHADD_qr_u8 : MVE_VxADDSUB_qr<"vhadd", "u8", 0b1, 0b00, 0b0>; def MVE_VHADD_qr_u16 : MVE_VxADDSUB_qr<"vhadd", "u16", 0b1, 0b01, 0b0>; def MVE_VHADD_qr_u32 : MVE_VxADDSUB_qr<"vhadd", "u32", 0b1, 0b10, 0b0>; def MVE_VHSUB_qr_s8 : MVE_VxADDSUB_qr<"vhsub", "s8", 0b0, 0b00, 0b1>; def MVE_VHSUB_qr_s16 : MVE_VxADDSUB_qr<"vhsub", "s16", 0b0, 0b01, 0b1>; def MVE_VHSUB_qr_s32 : MVE_VxADDSUB_qr<"vhsub", "s32", 0b0, 0b10, 0b1>; def MVE_VHSUB_qr_u8 : MVE_VxADDSUB_qr<"vhsub", "u8", 0b1, 0b00, 0b1>; def MVE_VHSUB_qr_u16 : MVE_VxADDSUB_qr<"vhsub", "u16", 0b1, 0b01, 0b1>; def MVE_VHSUB_qr_u32 : MVE_VxADDSUB_qr<"vhsub", "u32", 0b1, 0b10, 0b1>; let Predicates = [HasMVEFloat] in { def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>; def MVE_VADD_qr_f16 : MVE_VxADDSUB_qr<"vadd", "f16", 0b1, 0b11, 0b0>; def MVE_VSUB_qr_f32 : MVE_VxADDSUB_qr<"vsub", "f32", 0b0, 0b11, 0b1>; def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>; } class MVE_VxSHL_qr size, bit bit_7, bit bit_17, list pattern=[]> : MVE_qDest_single_rSrc { let 
Inst{28} = U; let Inst{25-23} = 0b100; let Inst{21-20} = 0b11; let Inst{19-18} = size; let Inst{17} = bit_17; let Inst{16} = 0b1; let Inst{12-8} = 0b11110; let Inst{7} = bit_7; let Inst{6-4} = 0b110; let validForTailPredication = 1; } multiclass MVE_VxSHL_qr_types { def s8 : MVE_VxSHL_qr; def s16 : MVE_VxSHL_qr; def s32 : MVE_VxSHL_qr; def u8 : MVE_VxSHL_qr; def u16 : MVE_VxSHL_qr; def u32 : MVE_VxSHL_qr; } defm MVE_VSHL_qr : MVE_VxSHL_qr_types<"vshl", 0b0, 0b0>; defm MVE_VRSHL_qr : MVE_VxSHL_qr_types<"vrshl", 0b0, 0b1>; defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>; defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>; let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>; def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>; def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>; def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>; def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>; def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>; } class MVE_VBRSR size, list pattern=[]> : MVE_qDest_rSrc { let Inst{28} = 0b1; let Inst{21-20} = size; let Inst{16} = 0b1; let Inst{12} = 0b1; let Inst{8} = 0b0; let Inst{5} = 0b1; let validForTailPredication = 1; } def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>; def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>; def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 ( bitreverse (v16i8 MQPR:$val1))), (v16i8 ( MVE_VBRSR8 (v16i8 MQPR:$val1), (t2MOVi (i32 8)) ))>; def : Pat<(v4i32 ( bitreverse (v4i32 MQPR:$val1))), (v4i32 ( MVE_VBRSR32 (v4i32 MQPR:$val1), (t2MOVi (i32 32)) ))>; def : Pat<(v8i16 ( bitreverse (v8i16 MQPR:$val1))), (v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>; } class MVE_VMUL_qr_int size, list pattern=[]> : MVE_qDest_rSrc { let Inst{28} = 0b0; let Inst{21-20} = size; let Inst{16} = 0b1; let Inst{12} = 0b1; let Inst{8} = 0b0; let Inst{5} = 0b1; let validForTailPredication = 1; } def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>; def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>; def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; } class MVE_VxxMUL_qr bits_21_20, list pattern=[]> : MVE_qDest_rSrc { let Inst{28} = bit_28; let Inst{21-20} = bits_21_20; let Inst{16} = 0b1; let Inst{12} = 0b0; let Inst{8} = 0b0; let Inst{5} = 0b1; } def MVE_VQDMULH_qr_s8 : MVE_VxxMUL_qr<"vqdmulh", "s8", 0b0, 0b00>; def MVE_VQDMULH_qr_s16 : MVE_VxxMUL_qr<"vqdmulh", "s16", 0b0, 0b01>; def MVE_VQDMULH_qr_s32 : MVE_VxxMUL_qr<"vqdmulh", "s32", 0b0, 0b10>; def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>; def MVE_VQRDMULH_qr_s16 : 
MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>; def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>; let Predicates = [HasMVEFloat], validForTailPredication = 1 in { def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>; def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>; } class MVE_VFMAMLA_qr bits_21_20, bit S, list pattern=[]> : MVE_qDestSrc_rSrc { let Inst{28} = bit_28; let Inst{21-20} = bits_21_20; let Inst{16} = 0b1; let Inst{12} = S; let Inst{8} = 0b0; let Inst{5} = 0b0; let validForTailPredication = 1; } def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>; def MVE_VMLA_qr_s16 : MVE_VFMAMLA_qr<"vmla", "s16", 0b0, 0b01, 0b0>; def MVE_VMLA_qr_s32 : MVE_VFMAMLA_qr<"vmla", "s32", 0b0, 0b10, 0b0>; def MVE_VMLA_qr_u8 : MVE_VFMAMLA_qr<"vmla", "u8", 0b1, 0b00, 0b0>; def MVE_VMLA_qr_u16 : MVE_VFMAMLA_qr<"vmla", "u16", 0b1, 0b01, 0b0>; def MVE_VMLA_qr_u32 : MVE_VFMAMLA_qr<"vmla", "u32", 0b1, 0b10, 0b0>; def MVE_VMLAS_qr_s8 : MVE_VFMAMLA_qr<"vmlas", "s8", 0b0, 0b00, 0b1>; def MVE_VMLAS_qr_s16 : MVE_VFMAMLA_qr<"vmlas", "s16", 0b0, 0b01, 0b1>; def MVE_VMLAS_qr_s32 : MVE_VFMAMLA_qr<"vmlas", "s32", 0b0, 0b10, 0b1>; def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>; def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>; def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>; let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (add (v4i32 MQPR:$src1), (v4i32 (mul (v4i32 MQPR:$src2), (v4i32 (ARMvdup (i32 rGPR:$x))))))), (v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>; def : Pat<(v8i16 (add (v8i16 MQPR:$src1), (v8i16 (mul (v8i16 MQPR:$src2), (v8i16 (ARMvdup (i32 rGPR:$x))))))), (v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>; def : Pat<(v16i8 (add (v16i8 MQPR:$src1), (v16i8 (mul (v16i8 MQPR:$src2), (v16i8 (ARMvdup (i32 rGPR:$x))))))), (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>; } let Predicates = [HasMVEFloat] in { def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>; def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>; def MVE_VFMA_qr_Sf16 : MVE_VFMAMLA_qr<"vfmas", "f16", 0b1, 0b11, 0b1>; def MVE_VFMA_qr_Sf32 : MVE_VFMAMLA_qr<"vfmas", "f32", 0b0, 0b11, 0b1>; } class MVE_VQDMLAH_qr size, bit bit_5, bit bit_12, list pattern=[]> : MVE_qDestSrc_rSrc { let Inst{28} = U; let Inst{21-20} = size; let Inst{16} = 0b0; let Inst{12} = bit_12; let Inst{8} = 0b0; let Inst{5} = bit_5; } multiclass MVE_VQDMLAH_qr_types { def s8 : MVE_VQDMLAH_qr; def s16 : MVE_VQDMLAH_qr; def s32 : MVE_VQDMLAH_qr; } defm MVE_VQDMLAH_qr : MVE_VQDMLAH_qr_types<"vqdmlah", 0b1, 0b0>; defm MVE_VQRDMLAH_qr : MVE_VQDMLAH_qr_types<"vqrdmlah", 0b0, 0b0>; defm MVE_VQDMLASH_qr : MVE_VQDMLAH_qr_types<"vqdmlash", 0b1, 0b1>; defm MVE_VQRDMLASH_qr : MVE_VQDMLAH_qr_types<"vqrdmlash", 0b0, 0b1>; class MVE_VxDUP size, bit bit_12, list pattern=[]> : MVE_p<(outs MQPR:$Qd, tGPREven:$Rn), (ins tGPREven:$Rn_src, MVE_VIDUP_imm:$imm), NoItinerary, iname, suffix, "$Qd, $Rn, $imm", vpred_r, "$Rn = $Rn_src", pattern> { bits<4> Qd; bits<4> Rn; bits<2> imm; let Inst{28} = 0b0; let Inst{25-23} = 0b100; let Inst{22} = Qd{3}; let Inst{21-20} = size; let Inst{19-17} = Rn{3-1}; let Inst{16} = 0b1; let Inst{15-13} = Qd{2-0}; let Inst{12} = bit_12; let Inst{11-8} = 0b1111; let Inst{7} = imm{1}; let Inst{6-1} = 0b110111; let Inst{0} = imm{0}; let validForTailPredication = 1; } def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>; def MVE_VIDUPu16 : MVE_VxDUP<"vidup", "u16", 0b01, 0b0>; def MVE_VIDUPu32 : MVE_VxDUP<"vidup", "u32", 0b10, 0b0>; def 
MVE_VDDUPu8 : MVE_VxDUP<"vddup", "u8", 0b00, 0b1>; def MVE_VDDUPu16 : MVE_VxDUP<"vddup", "u16", 0b01, 0b1>; def MVE_VDDUPu32 : MVE_VxDUP<"vddup", "u32", 0b10, 0b1>; class MVE_VxWDUP size, bit bit_12, list pattern=[]> : MVE_p<(outs MQPR:$Qd, tGPREven:$Rn), (ins tGPREven:$Rn_src, tGPROdd:$Rm, MVE_VIDUP_imm:$imm), NoItinerary, iname, suffix, "$Qd, $Rn, $Rm, $imm", vpred_r, "$Rn = $Rn_src", pattern> { bits<4> Qd; bits<4> Rm; bits<4> Rn; bits<2> imm; let Inst{28} = 0b0; let Inst{25-23} = 0b100; let Inst{22} = Qd{3}; let Inst{21-20} = size; let Inst{19-17} = Rn{3-1}; let Inst{16} = 0b1; let Inst{15-13} = Qd{2-0}; let Inst{12} = bit_12; let Inst{11-8} = 0b1111; let Inst{7} = imm{1}; let Inst{6-4} = 0b110; let Inst{3-1} = Rm{3-1}; let Inst{0} = imm{0}; let validForTailPredication = 1; } def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>; def MVE_VIWDUPu16 : MVE_VxWDUP<"viwdup", "u16", 0b01, 0b0>; def MVE_VIWDUPu32 : MVE_VxWDUP<"viwdup", "u32", 0b10, 0b0>; def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>; def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>; def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; let hasSideEffects = 1 in class MVE_VCTP size, list pattern=[]> : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix, "$Rn", vpred_n, "", pattern> { bits<4> Rn; let Inst{28-27} = 0b10; let Inst{26-22} = 0b00000; let Inst{21-20} = size; let Inst{19-16} = Rn{3-0}; let Inst{15-11} = 0b11101; let Inst{10-0} = 0b00000000001; let Unpredictable{10-0} = 0b11111111111; let Constraints = ""; let DecoderMethod = "DecodeMveVCTP"; let validForTailPredication = 1; } def MVE_VCTP8 : MVE_VCTP<"8", 0b00>; def MVE_VCTP16 : MVE_VCTP<"16", 0b01>; def MVE_VCTP32 : MVE_VCTP<"32", 0b10>; def MVE_VCTP64 : MVE_VCTP<"64", 0b11>; let Predicates = [HasMVEInt] in { def : Pat<(int_arm_vctp8 rGPR:$Rn), (v16i1 (MVE_VCTP8 rGPR:$Rn))>; def : Pat<(int_arm_vctp16 rGPR:$Rn), (v8i1 (MVE_VCTP16 rGPR:$Rn))>; def : Pat<(int_arm_vctp32 rGPR:$Rn), (v4i1 (MVE_VCTP32 rGPR:$Rn))>; } // end of mve_qDest_rSrc // start of coproc mov class MVE_VMOV_64bit : MVE_VMOV_lane_base { bits<5> Rt; bits<5> Rt2; bits<4> Qd; bit idx; bit idx2; let Inst{31-23} = 0b111011000; let Inst{22} = Qd{3}; let Inst{21} = 0b0; let Inst{20} = to_qreg; let Inst{19-16} = Rt2{3-0}; let Inst{15-13} = Qd{2-0}; let Inst{12-5} = 0b01111000; let Inst{4} = idx2; let Inst{3-0} = Rt{3-0}; } // The assembly syntax for these instructions mentions the vector // register name twice, e.g. // // vmov q2[2], q2[0], r0, r1 // vmov r0, r1, q2[2], q2[0] // // which needs a bit of juggling with MC operand handling. // // For the move _into_ a vector register, the MC operand list also has // to mention the register name twice: once as the output, and once as // an extra input to represent where the unchanged half of the output // register comes from (when this instruction is used in code // generation). So we arrange that the first mention of the vector reg // in the instruction is considered by the AsmMatcher to be the output // ($Qd), and the second one is the input ($QdSrc). Binding them // together with the existing 'tie' constraint is enough to enforce at // register allocation time that they have to be the same register. // // For the move _from_ a vector register, there's no way to get round // the fact that both instances of that register name have to be // inputs. They have to be the same register again, but this time, we // can't use a tie constraint, because that has to be between an // output and an input operand. 
So this time, we have to arrange that // the q-reg appears just once in the MC operand list, in spite of // being mentioned twice in the asm syntax - which needs a custom // AsmMatchConverter. def MVE_VMOV_q_rr : MVE_VMOV_64bit<(outs MQPR:$Qd), (ins MQPR:$QdSrc, rGPR:$Rt, rGPR:$Rt2), 0b1, "$Qd$idx, $QdSrc$idx2, $Rt, $Rt2", "$Qd = $QdSrc"> { let DecoderMethod = "DecodeMVEVMOVDRegtoQ"; } def MVE_VMOV_rr_q : MVE_VMOV_64bit<(outs rGPR:$Rt, rGPR:$Rt2), (ins MQPR:$Qd), 0b0, "$Rt, $Rt2, $Qd$idx, $Qd$idx2", ""> { let DecoderMethod = "DecodeMVEVMOVQtoDReg"; let AsmMatchConverter = "cvtMVEVMOVQtoDReg"; } // end of coproc mov // start of MVE interleaving load/store // Base class for the family of interleaving/deinterleaving // load/stores with names like VLD20.8 and VST43.32. class MVE_vldst24_base stage, bits<2> size, bit load, dag Oops, dag loadIops, dag wbIops, string iname, string ops, string cstr, list pattern=[]> : MVE_MI { bits<4> VQd; bits<4> Rn; let Inst{31-22} = 0b1111110010; let Inst{21} = writeback; let Inst{20} = load; let Inst{19-16} = Rn; let Inst{15-13} = VQd{2-0}; let Inst{12-9} = 0b1111; let Inst{8-7} = size; let Inst{6-5} = stage; let Inst{4-1} = 0b0000; let Inst{0} = fourregs; let mayLoad = load; let mayStore = !eq(load,0); } // A parameter class used to encapsulate all the ways the writeback // variants of VLD20 and friends differ from the non-writeback ones. class MVE_vldst24_writeback { bit writeback = b; dag Oops = Oo; dag Iops = Io; string syntax = sy; string cstr = c; string id_suffix = n; } // Another parameter class that encapsulates the differences between VLD2x // and VLD4x. class MVE_vldst24_nvecs s, bit b, RegisterOperand vl> { int nvecs = n; list stages = s; bit bit0 = b; RegisterOperand VecList = vl; } // A third parameter class that distinguishes VLDnn.8 from .16 from .32. class MVE_vldst24_lanesize b> { int lanesize = i; bits<2> sizebits = b; } // A base class for each direction of transfer: one for load, one for // store. I can't make these a fourth independent parametric tuple // class, because they have to take the nvecs tuple class as a // parameter, in order to find the right VecList operand type. class MVE_vld24_base pat, bits<2> size, MVE_vldst24_writeback wb, string iname, list pattern=[]> : MVE_vldst24_base; class MVE_vst24_base pat, bits<2> size, MVE_vldst24_writeback wb, string iname, list pattern=[]> : MVE_vldst24_base; // Actually define all the interleaving loads and stores, by a series // of nested foreaches over number of vectors (VLD2/VLD4); stage // within one of those series (VLDx0/VLDx1/VLDx2/VLDx3); size of // vector lane; writeback or no writeback. foreach n = [MVE_vldst24_nvecs<2, [0,1], 0, VecList2Q>, MVE_vldst24_nvecs<4, [0,1,2,3], 1, VecList4Q>] in foreach stage = n.stages in foreach s = [MVE_vldst24_lanesize< 8, 0b00>, MVE_vldst24_lanesize<16, 0b01>, MVE_vldst24_lanesize<32, 0b10>] in foreach wb = [MVE_vldst24_writeback< 1, (outs rGPR:$wb), (ins t2_nosp_addr_offset_none:$Rn), "!", "$Rn.base = $wb", "_wb">, MVE_vldst24_writeback<0, (outs), (ins t2_addr_offset_none:$Rn)>] in { // For each case within all of those foreaches, define the actual // instructions. The def names are made by gluing together pieces // from all the parameter classes, and will end up being things like // MVE_VLD20_8 and MVE_VST43_16_wb. 
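// (Altogether these loops expand to 36 load and 36 store definitions:
// 2+4 stages across the VLD2/VLD4 families, times 3 lane sizes, times the
// writeback and non-writeback variants.)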
def "MVE_VLD" # n.nvecs # stage # "_" # s.lanesize # wb.id_suffix : MVE_vld24_base; def "MVE_VST" # n.nvecs # stage # "_" # s.lanesize # wb.id_suffix : MVE_vst24_base; } +multiclass MVE_vst24_patterns { + foreach stage = [0,1] in + def : Pat<(int_arm_mve_vst2q i32:$addr, + (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)), + (!cast("MVE_VST2"#stage#"_"#lanesize) + (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), + t2_addr_offset_none:$addr)>; + + foreach stage = [0,1,2,3] in + def : Pat<(int_arm_mve_vst4q i32:$addr, + (VT MQPR:$v0), (VT MQPR:$v1), + (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)), + (!cast("MVE_VST4"#stage#"_"#lanesize) + (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, + VT:$v2, qsub_2, VT:$v3, qsub_3), + t2_addr_offset_none:$addr)>; +} +defm : MVE_vst24_patterns<8, v16i8>; +defm : MVE_vst24_patterns<16, v8i16>; +defm : MVE_vst24_patterns<32, v4i32>; +defm : MVE_vst24_patterns<16, v8f16>; +defm : MVE_vst24_patterns<32, v4f32>; + // end of MVE interleaving load/store // start of MVE predicable load/store // A parameter class for the direction of transfer. class MVE_ldst_direction { bit load = b; dag Oops = Oo; dag Iops = Io; string cstr = c; } def MVE_ld: MVE_ldst_direction<1, (outs MQPR:$Qd), (ins), ",@earlyclobber $Qd">; def MVE_st: MVE_ldst_direction<0, (outs), (ins MQPR:$Qd)>; // A parameter class for the size of memory access in a load. class MVE_memsz e, int s, AddrMode m, string mn, list types> { bits<2> encoding = e; // opcode bit(s) for encoding int shift = s; // shift applied to immediate load offset AddrMode AM = m; // For instruction aliases: define the complete list of type // suffixes at this size, and the canonical ones for loads and // stores. string MnemonicLetter = mn; int TypeBits = !shl(8, s); string CanonLoadSuffix = ".u" # TypeBits; string CanonStoreSuffix = "." # TypeBits; list suffixes = !foreach(letter, types, "." # letter # TypeBits); } // Instances of MVE_memsz. // // (memD doesn't need an AddrMode, because those are only for // contiguous loads, and memD is only used by gather/scatters.) def MVE_memB: MVE_memsz<0b00, 0, AddrModeT2_i7, "b", ["", "u", "s"]>; def MVE_memH: MVE_memsz<0b01, 1, AddrModeT2_i7s2, "h", ["", "u", "s", "f"]>; def MVE_memW: MVE_memsz<0b10, 2, AddrModeT2_i7s4, "w", ["", "u", "s", "f"]>; def MVE_memD: MVE_memsz<0b11, 3, ?, "d", ["", "u", "s", "f"]>; // This is the base class for all the MVE loads and stores other than // the interleaving ones. All the non-interleaving loads/stores share // the characteristic that they operate on just one vector register, // so they are VPT-predicable. // // The predication operand is vpred_n, for both loads and stores. For // store instructions, the reason is obvious: if there is no output // register, there can't be a need for an input parameter giving the // output register's previous value. Load instructions also don't need // that input parameter, because unlike MVE data processing // instructions, predicated loads are defined to set the inactive // lanes of the output register to zero, instead of preserving their // input values. class MVE_VLDRSTR_base pattern=[]> : MVE_p { bits<3> Qd; let Inst{28} = U; let Inst{25} = 0b0; let Inst{24} = P; let Inst{22} = 0b0; let Inst{21} = W; let Inst{20} = dir.load; let Inst{15-13} = Qd{2-0}; let Inst{12} = opc; let Inst{11-9} = 0b111; let mayLoad = dir.load; let mayStore = !eq(dir.load,0); let validForTailPredication = 1; } // Contiguous load and store instructions. 
These come in two main // categories: same-size loads/stores in which 128 bits of vector // register is transferred to or from 128 bits of memory in the most // obvious way, and widening loads / narrowing stores, in which the // size of memory accessed is less than the size of a vector register, // so the load instructions sign- or zero-extend each memory value // into a wider vector lane, and the store instructions truncate // correspondingly. // // The instruction mnemonics for these two classes look reasonably // similar, but the actual encodings are different enough to need two // separate base classes. // Contiguous, same size class MVE_VLDRSTR_cs : MVE_VLDRSTR_base { bits<12> addr; let Inst{23} = addr{7}; let Inst{19-16} = addr{11-8}; let Inst{8-7} = memsz.encoding; let Inst{6-0} = addr{6-0}; } // Contiguous, widening/narrowing class MVE_VLDRSTR_cw size, dag oops, dag iops, string asm, string suffix, IndexMode im, string ops, string cstr> : MVE_VLDRSTR_base { bits<11> addr; let Inst{23} = addr{7}; let Inst{19} = memsz.encoding{0}; // enough to tell 16- from 32-bit let Inst{18-16} = addr{10-8}; let Inst{8-7} = size; let Inst{6-0} = addr{6-0}; let IM = im; } // Multiclass wrapper on each of the _cw and _cs base classes, to // generate three writeback modes (none, preindex, postindex). multiclass MVE_VLDRSTR_cw_m size> { let AM = memsz.AM in { def "" : MVE_VLDRSTR_cw< dir, memsz, U, 1, 0, size, dir.Oops, !con(dir.Iops, (ins taddrmode_imm7:$addr)), asm, suffix, IndexModeNone, "$Qd, $addr", "">; def _pre : MVE_VLDRSTR_cw< dir, memsz, U, 1, 1, size, !con((outs tGPR:$wb), dir.Oops), !con(dir.Iops, (ins taddrmode_imm7:$addr)), asm, suffix, IndexModePre, "$Qd, $addr!", "$addr.base = $wb"> { let DecoderMethod = "DecodeMVE_MEM_1_pre<"#memsz.shift#">"; } def _post : MVE_VLDRSTR_cw< dir, memsz, U, 0, 1, size, !con((outs tGPR:$wb), dir.Oops), !con(dir.Iops, (ins t_addr_offset_none:$Rn, t2am_imm7_offset:$addr)), asm, suffix, IndexModePost, "$Qd, $Rn$addr", "$Rn.base = $wb"> { bits<4> Rn; let Inst{18-16} = Rn{2-0}; } } } multiclass MVE_VLDRSTR_cs_m { let AM = memsz.AM in { def "" : MVE_VLDRSTR_cs< dir, memsz, 1, 0, dir.Oops, !con(dir.Iops, (ins t2addrmode_imm7:$addr)), asm, suffix, IndexModeNone, "$Qd, $addr", "">; def _pre : MVE_VLDRSTR_cs< dir, memsz, 1, 1, !con((outs rGPR:$wb), dir.Oops), !con(dir.Iops, (ins t2addrmode_imm7_pre:$addr)), asm, suffix, IndexModePre, "$Qd, $addr!", "$addr.base = $wb"> { let DecoderMethod = "DecodeMVE_MEM_2_pre<"#memsz.shift#">"; } def _post : MVE_VLDRSTR_cs< dir, memsz, 0, 1, !con((outs rGPR:$wb), dir.Oops), // We need an !if here to select the base register class, // because it's legal to write back to SP in a load of this // type, but not in a store. !con(dir.Iops, (ins !if(dir.load, t2_addr_offset_none, t2_nosp_addr_offset_none):$Rn, t2am_imm7_offset:$addr)), asm, suffix, IndexModePost, "$Qd, $Rn$addr", "$Rn.base = $wb"> { bits<4> Rn; let Inst{19-16} = Rn{3-0}; } } } // Now actually declare all the contiguous load/stores, via those // multiclasses. The instruction ids coming out of this are the bare // names shown in the defm, with _pre or _post appended for writeback, // e.g. MVE_VLDRBS16, MVE_VSTRB16_pre, MVE_VSTRHU16_post. 
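// For example, MVE_VLDRBS16 is the load (MVE_ld) of byte-sized memory
// (MVE_memB) sign-extended into 16-bit lanes, and its writeback variants
// come out as MVE_VLDRBS16_pre and MVE_VLDRBS16_post.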
defm MVE_VLDRBS16: MVE_VLDRSTR_cw_m; defm MVE_VLDRBS32: MVE_VLDRSTR_cw_m; defm MVE_VLDRBU16: MVE_VLDRSTR_cw_m; defm MVE_VLDRBU32: MVE_VLDRSTR_cw_m; defm MVE_VLDRHS32: MVE_VLDRSTR_cw_m; defm MVE_VLDRHU32: MVE_VLDRSTR_cw_m; defm MVE_VLDRBU8: MVE_VLDRSTR_cs_m; defm MVE_VLDRHU16: MVE_VLDRSTR_cs_m; defm MVE_VLDRWU32: MVE_VLDRSTR_cs_m; defm MVE_VSTRB16: MVE_VLDRSTR_cw_m; defm MVE_VSTRB32: MVE_VLDRSTR_cw_m; defm MVE_VSTRH32: MVE_VLDRSTR_cw_m; defm MVE_VSTRBU8 : MVE_VLDRSTR_cs_m; defm MVE_VSTRHU16: MVE_VLDRSTR_cs_m; defm MVE_VSTRWU32: MVE_VLDRSTR_cs_m; // Gather loads / scatter stores whose address operand is of the form // [Rn,Qm], i.e. a single GPR as the common base address, plus a // vector of offset from it. ('Load/store this sequence of elements of // the same array.') // // Like the contiguous family, these loads and stores can widen the // loaded values / truncate the stored ones, or they can just // load/store the same size of memory and vector lane. But unlike the // contiguous family, there's no particular difference in encoding // between those two cases. // // This family also comes with the option to scale the offset values // in Qm by the size of the loaded memory (i.e. to treat them as array // indices), or not to scale them (to treat them as plain byte offsets // in memory, so that perhaps the loaded values are unaligned). The // scaled instructions' address operand in assembly looks like // [Rn,Qm,UXTW #2] or similar. // Base class. class MVE_VLDRSTR_rq size, bit os, string asm, string suffix, int shift> : MVE_VLDRSTR_base:$addr)), asm, suffix, "$Qd, $addr", dir.cstr> { bits<7> addr; let Inst{23} = 0b1; let Inst{19-16} = addr{6-3}; let Inst{8-7} = size; let Inst{6} = memsz.encoding{1}; let Inst{5} = 0; let Inst{4} = memsz.encoding{0}; let Inst{3-1} = addr{2-0}; let Inst{0} = os; } // Multiclass that defines the scaled and unscaled versions of an // instruction, when the memory size is wider than a byte. The scaled // version gets the default name like MVE_VLDRBU16_rq; the unscaled / // potentially unaligned version gets a "_u" suffix, e.g. // MVE_VLDRBU16_rq_u. multiclass MVE_VLDRSTR_rq_w size> { def _u : MVE_VLDRSTR_rq; def "" : MVE_VLDRSTR_rq; } // Subclass of MVE_VLDRSTR_rq with the same API as that multiclass, // for use when the memory size is one byte, so there's no 'scaled' // version of the instruction at all. (This is encoded as if it were // unscaled, but named in the default way with no _u suffix.) class MVE_VLDRSTR_rq_b size> : MVE_VLDRSTR_rq; // Actually define all the loads and stores in this family. def MVE_VLDRBU8_rq : MVE_VLDRSTR_rq_b; def MVE_VLDRBU16_rq: MVE_VLDRSTR_rq_b; def MVE_VLDRBS16_rq: MVE_VLDRSTR_rq_b; def MVE_VLDRBU32_rq: MVE_VLDRSTR_rq_b; def MVE_VLDRBS32_rq: MVE_VLDRSTR_rq_b; defm MVE_VLDRHU16_rq: MVE_VLDRSTR_rq_w; defm MVE_VLDRHU32_rq: MVE_VLDRSTR_rq_w; defm MVE_VLDRHS32_rq: MVE_VLDRSTR_rq_w; defm MVE_VLDRWU32_rq: MVE_VLDRSTR_rq_w; defm MVE_VLDRDU64_rq: MVE_VLDRSTR_rq_w; def MVE_VSTRB8_rq : MVE_VLDRSTR_rq_b; def MVE_VSTRB16_rq : MVE_VLDRSTR_rq_b; def MVE_VSTRB32_rq : MVE_VLDRSTR_rq_b; defm MVE_VSTRH16_rq : MVE_VLDRSTR_rq_w; defm MVE_VSTRH32_rq : MVE_VLDRSTR_rq_w; defm MVE_VSTRW32_rq : MVE_VLDRSTR_rq_w; defm MVE_VSTRD64_rq : MVE_VLDRSTR_rq_w; // Gather loads / scatter stores whose address operand is of the form // [Qm,#imm], i.e. a vector containing a full base address for each // loaded item, plus an immediate offset applied consistently to all // of them. 
('Load/store the same field from this vector of pointers // to a structure type.') // // This family requires the vector lane size to be at least 32 bits // (so there's room for an address in each lane at all). It has no // widening/narrowing variants. But it does support preindex // writeback, in which the address vector is updated to hold the // addresses actually loaded from. // Base class. class MVE_VLDRSTR_qi : MVE_VLDRSTR_base:$addr)), asm, suffix, "$Qd, $addr" # wbAsm, cstr # dir.cstr> { bits<11> addr; let Inst{23} = addr{7}; let Inst{19-17} = addr{10-8}; let Inst{16} = 0; let Inst{8} = memsz.encoding{0}; // enough to distinguish 32- from 64-bit let Inst{7} = 0; let Inst{6-0} = addr{6-0}; } // Multiclass that generates the non-writeback and writeback variants. multiclass MVE_VLDRSTR_qi_m { def "" : MVE_VLDRSTR_qi; def _pre : MVE_VLDRSTR_qi { let DecoderMethod="DecodeMVE_MEM_3_pre<"#memsz.shift#">"; } } // Actual instruction definitions. defm MVE_VLDRWU32_qi: MVE_VLDRSTR_qi_m; defm MVE_VLDRDU64_qi: MVE_VLDRSTR_qi_m; defm MVE_VSTRW32_qi: MVE_VLDRSTR_qi_m; defm MVE_VSTRD64_qi: MVE_VLDRSTR_qi_m; // Define aliases for all the instructions where memory size and // vector lane size are the same. These are mnemonic aliases, so they // apply consistently across all of the above families - contiguous // loads, and both the rq and qi types of gather/scatter. // // Rationale: As long as you're loading (for example) 16-bit memory // values into 16-bit vector lanes, you can think of them as signed or // unsigned integers, fp16 or just raw 16-bit blobs and it makes no // difference. So we permit all of vldrh.16, vldrh.u16, vldrh.s16, // vldrh.f16 and treat them all as equivalent to the canonical // spelling (which happens to be .u16 for loads, and just .16 for // stores). foreach vpt_cond = ["", "t", "e"] in foreach memsz = [MVE_memB, MVE_memH, MVE_memW, MVE_memD] in foreach suffix = memsz.suffixes in { // These foreaches are conceptually ifs, implemented by iterating a // dummy variable over a list with 0 or 1 elements depending on the // condition. The idea is to iterate over _nearly_ all the suffixes // in memsz.suffixes, but omit the one we want all the others to alias. 
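// For example, with memsz = MVE_memH and suffix ".s16", the loop body below
// emits aliases mapping "vldrht.s16" onto the canonical "vldrht.u16" and
// "vstrht.s16" onto "vstrht.16" (and likewise for the "" and "e" vpt_cond
// variants).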
foreach _ = !if(!ne(suffix, memsz.CanonLoadSuffix), [1], []) in def : MnemonicAlias< "vldr" # memsz.MnemonicLetter # vpt_cond # suffix, "vldr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonLoadSuffix>; foreach _ = !if(!ne(suffix, memsz.CanonStoreSuffix), [1], []) in def : MnemonicAlias< "vstr" # memsz.MnemonicLetter # vpt_cond # suffix, "vstr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonStoreSuffix>; } // end of MVE predicable load/store class MVE_VPT size, dag iops, string asm, list pattern=[]> : MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm, "", pattern> { bits<3> fc; bits<4> Mk; bits<3> Qn; let Inst{31-23} = 0b111111100; let Inst{22} = Mk{3}; let Inst{21-20} = size; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b1; let Inst{15-13} = Mk{2-0}; let Inst{12} = fc{2}; let Inst{11-8} = 0b1111; let Inst{7} = fc{0}; let Inst{4} = 0b0; let Defs = [VPR]; let validForTailPredication = 1; } class MVE_VPTt1 size, dag iops> : MVE_VPT { bits<4> Qm; bits<4> Mk; let Inst{6} = 0b0; let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = fc{1}; let validForTailPredication = 1; } class MVE_VPTt1i size> : MVE_VPTt1 { let Inst{12} = 0b0; let Inst{0} = 0b0; } def MVE_VPTv4i32 : MVE_VPTt1i<"i32", 0b10>; def MVE_VPTv8i16 : MVE_VPTt1i<"i16", 0b01>; def MVE_VPTv16i8 : MVE_VPTt1i<"i8", 0b00>; class MVE_VPTt1u size> : MVE_VPTt1 { let Inst{12} = 0b0; let Inst{0} = 0b1; } def MVE_VPTv4u32 : MVE_VPTt1u<"u32", 0b10>; def MVE_VPTv8u16 : MVE_VPTt1u<"u16", 0b01>; def MVE_VPTv16u8 : MVE_VPTt1u<"u8", 0b00>; class MVE_VPTt1s size> : MVE_VPTt1 { let Inst{12} = 0b1; } def MVE_VPTv4s32 : MVE_VPTt1s<"s32", 0b10>; def MVE_VPTv8s16 : MVE_VPTt1s<"s16", 0b01>; def MVE_VPTv16s8 : MVE_VPTt1s<"s8", 0b00>; class MVE_VPTt2 size, dag iops> : MVE_VPT { bits<4> Rm; bits<3> fc; bits<4> Mk; let Inst{6} = 0b1; let Inst{5} = fc{1}; let Inst{3-0} = Rm{3-0}; } class MVE_VPTt2i size> : MVE_VPTt2 { let Inst{12} = 0b0; let Inst{5} = 0b0; } def MVE_VPTv4i32r : MVE_VPTt2i<"i32", 0b10>; def MVE_VPTv8i16r : MVE_VPTt2i<"i16", 0b01>; def MVE_VPTv16i8r : MVE_VPTt2i<"i8", 0b00>; class MVE_VPTt2u size> : MVE_VPTt2 { let Inst{12} = 0b0; let Inst{5} = 0b1; } def MVE_VPTv4u32r : MVE_VPTt2u<"u32", 0b10>; def MVE_VPTv8u16r : MVE_VPTt2u<"u16", 0b01>; def MVE_VPTv16u8r : MVE_VPTt2u<"u8", 0b00>; class MVE_VPTt2s size> : MVE_VPTt2 { let Inst{12} = 0b1; } def MVE_VPTv4s32r : MVE_VPTt2s<"s32", 0b10>; def MVE_VPTv8s16r : MVE_VPTt2s<"s16", 0b01>; def MVE_VPTv16s8r : MVE_VPTt2s<"s8", 0b00>; class MVE_VPTf pattern=[]> : MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm, "", pattern> { bits<3> fc; bits<4> Mk; bits<3> Qn; let Inst{31-29} = 0b111; let Inst{28} = size; let Inst{27-23} = 0b11100; let Inst{22} = Mk{3}; let Inst{21-20} = 0b11; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b1; let Inst{15-13} = Mk{2-0}; let Inst{12} = fc{2}; let Inst{11-8} = 0b1111; let Inst{7} = fc{0}; let Inst{4} = 0b0; let Defs = [VPR]; let Predicates = [HasMVEFloat]; let validForTailPredication = 1; } class MVE_VPTft1 : MVE_VPTf { bits<3> fc; bits<4> Qm; let Inst{6} = 0b0; let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = fc{1}; } def MVE_VPTv4f32 : MVE_VPTft1<"f32", 0b0>; def MVE_VPTv8f16 : MVE_VPTft1<"f16", 0b1>; class MVE_VPTft2 : MVE_VPTf { bits<3> fc; bits<4> Rm; let Inst{6} = 0b1; let Inst{5} = fc{1}; let Inst{3-0} = Rm{3-0}; } def MVE_VPTv4f32r : MVE_VPTft2<"f32", 0b0>; def MVE_VPTv8f16r : MVE_VPTft2<"f16", 0b1>; def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary, !strconcat("vpst", "${Mk}"), "", "", 
[]> { bits<4> Mk; let Inst{31-23} = 0b111111100; let Inst{22} = Mk{3}; let Inst{21-16} = 0b110001; let Inst{15-13} = Mk{2-0}; let Inst{12-0} = 0b0111101001101; let Unpredictable{12} = 0b1; let Unpredictable{7} = 0b1; let Unpredictable{5} = 0b1; let Uses = [VPR]; let validForTailPredication = 1; } def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, "vpsel", "", "$Qd, $Qn, $Qm", vpred_n, "", []> { bits<4> Qn; bits<4> Qd; bits<4> Qm; let Inst{28} = 0b1; let Inst{25-23} = 0b100; let Inst{22} = Qd{3}; let Inst{21-20} = 0b11; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b1; let Inst{15-13} = Qd{2-0}; let Inst{12-9} = 0b0111; let Inst{8} = 0b1; let Inst{7} = Qn{3}; let Inst{6} = 0b0; let Inst{5} = Qm{3}; let Inst{4} = 0b0; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; let validForTailPredication = 1; } foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f32"] in def : MVEInstAlias<"vpsel${vp}." # suffix # "\t$Qd, $Qn, $Qm", (MVE_VPSEL MQPR:$Qd, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), 1)))>; def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; def : Pat<(v8f16 (vselect (v8i16 MQPR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; def : Pat<(v4f32 (vselect (v4i32 MQPR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; // Pred <-> Int def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))), (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))), (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))), (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))), (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))), (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; 
def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))), (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; def : Pat<(v16i1 (trunc (v16i8 MQPR:$v1))), (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, 1))>; def : Pat<(v8i1 (trunc (v8i16 MQPR:$v1))), (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, 1))>; def : Pat<(v4i1 (trunc (v4i32 MQPR:$v1))), (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, 1))>; } let Predicates = [HasMVEFloat] in { // Pred <-> Float // 112 is 1.0 in float def : Pat<(v4f32 (uint_to_fp (v4i1 VCCR:$pred))), (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; // 2620 in 1.0 in half def : Pat<(v8f16 (uint_to_fp (v8i1 VCCR:$pred))), (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; // 240 is -1.0 in float def : Pat<(v4f32 (sint_to_fp (v4i1 VCCR:$pred))), (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; // 2748 is -1.0 in half def : Pat<(v8f16 (sint_to_fp (v8i1 VCCR:$pred))), (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; def : Pat<(v4i1 (fp_to_uint (v4f32 MQPR:$v1))), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; def : Pat<(v8i1 (fp_to_uint (v8f16 MQPR:$v1))), (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; def : Pat<(v4i1 (fp_to_sint (v4f32 MQPR:$v1))), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; def : Pat<(v8i1 (fp_to_sint (v8f16 MQPR:$v1))), (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; } def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary, "vpnot", "", "", vpred_n, "", []> { let Inst{31-0} = 0b11111110001100010000111101001101; let Unpredictable{19-17} = 0b111; let Unpredictable{12} = 0b1; let Unpredictable{7} = 0b1; let Unpredictable{5} = 0b1; let Constraints = ""; let DecoderMethod = "DecodeMVEVPNOT"; } let Predicates = [HasMVEInt] in { def : Pat<(v4i1 (xor (v4i1 VCCR:$pred), (v4i1 (predicate_cast (i32 65535))))), (v4i1 (MVE_VPNOT (v4i1 VCCR:$pred)))>; def : Pat<(v8i1 (xor (v8i1 VCCR:$pred), (v8i1 (predicate_cast (i32 65535))))), (v8i1 (MVE_VPNOT (v8i1 VCCR:$pred)))>; def : Pat<(v16i1 (xor (v16i1 VCCR:$pred), (v16i1 (predicate_cast (i32 65535))))), (v16i1 (MVE_VPNOT (v16i1 VCCR:$pred)))>; } class MVE_loltp_start size> : t2LOL<(outs GPRlr:$LR), iops, asm, ops> { bits<4> Rn; let Predicates = [HasMVEInt]; let Inst{22} = 0b0; let Inst{21-20} = size; let Inst{19-16} = Rn{3-0}; let Inst{12} = 0b0; } class MVE_DLSTP size> : MVE_loltp_start<(ins rGPR:$Rn), asm, "$LR, $Rn", size> { let Inst{13} = 0b1; let Inst{11-1} = 0b00000000000; let Unpredictable{10-1} = 0b1111111111; } class MVE_WLSTP size> : MVE_loltp_start<(ins rGPR:$Rn, wlslabel_u11:$label), asm, "$LR, $Rn, $label", size> { bits<11> label; let Inst{13} = 0b0; let Inst{11} = label{0}; let Inst{10-1} = label{10-1}; } def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>; def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>; def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>; def MVE_DLSTP_64 : MVE_DLSTP<"dlstp.64", 0b11>; def MVE_WLSTP_8 : MVE_WLSTP<"wlstp.8", 0b00>; def MVE_WLSTP_16 : MVE_WLSTP<"wlstp.16", 0b01>; def MVE_WLSTP_32 : MVE_WLSTP<"wlstp.32", 0b10>; def MVE_WLSTP_64 : MVE_WLSTP<"wlstp.64", 0b11>; class MVE_loltp_end : t2LOL { let Predicates = [HasMVEInt]; let Inst{22-21} = 0b00; let Inst{19-16} = 0b1111; let Inst{12} = 0b0; } def MVE_LETP : MVE_loltp_end<(outs GPRlr:$LRout), (ins GPRlr:$LRin, lelabel_u11:$label), 
"letp", "$LRin, $label"> { bits<11> label; let Inst{20} = 0b1; let Inst{13} = 0b0; let Inst{11} = label{0}; let Inst{10-1} = label{10-1}; } def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> { let Inst{20} = 0b0; let Inst{13} = 0b1; let Inst{11-1} = 0b00000000000; let Unpredictable{21-20} = 0b11; let Unpredictable{11-1} = 0b11111111111; } //===----------------------------------------------------------------------===// // Patterns //===----------------------------------------------------------------------===// class MVE_vector_store_typed : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7:$addr), (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr)>; class MVE_vector_maskedstore_typed : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7:$addr, VCCR:$pred), (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr, (i32 1), VCCR:$pred)>; multiclass MVE_vector_store { def : MVE_vector_store_typed; def : MVE_vector_store_typed; def : MVE_vector_store_typed; def : MVE_vector_store_typed; def : MVE_vector_store_typed; def : MVE_vector_store_typed; def : MVE_vector_store_typed; } class MVE_vector_load_typed : Pat<(Ty (LoadKind t2addrmode_imm7:$addr)), (Ty (RegImmInst t2addrmode_imm7:$addr))>; class MVE_vector_maskedload_typed : Pat<(Ty (LoadKind t2addrmode_imm7:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))), (Ty (RegImmInst t2addrmode_imm7:$addr, (i32 1), VCCR:$pred))>; multiclass MVE_vector_load { def : MVE_vector_load_typed; def : MVE_vector_load_typed; def : MVE_vector_load_typed; def : MVE_vector_load_typed; def : MVE_vector_load_typed; def : MVE_vector_load_typed; def : MVE_vector_load_typed; } class MVE_vector_offset_store_typed : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset:$addr), (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset:$addr)>; multiclass MVE_vector_offset_store { def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; } def aligned32_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), (pre_store node:$val, node:$ptr, node:$offset), [{ return cast(N)->getAlignment() >= 4; }]>; def aligned32_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), (post_store node:$val, node:$ptr, node:$offset), [{ return cast(N)->getAlignment() >= 4; }]>; def aligned16_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), (pre_store node:$val, node:$ptr, node:$offset), [{ return cast(N)->getAlignment() >= 2; }]>; def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), (post_store node:$val, node:$ptr, node:$offset), [{ return cast(N)->getAlignment() >= 2; }]>; def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), (masked_ld node:$ptr, node:$pred, node:$passthru), [{ auto *Ld = cast(N); return Ld->getMemoryVT().getScalarType() == MVT::i8; }]>; def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ return cast(N)->getExtensionType() == ISD::SEXTLOAD; }]>; def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ return cast(N)->getExtensionType() == ISD::ZEXTLOAD; }]>; def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ auto *Ld = cast(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return 
ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; }]>; def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), (masked_ld node:$ptr, node:$pred, node:$passthru), [{ auto *Ld = cast(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2; }]>; def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ return cast(N)->getExtensionType() == ISD::SEXTLOAD; }]>; def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ return cast(N)->getExtensionType() == ISD::ZEXTLOAD; }]>; def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ auto *Ld = cast(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; }]>; def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), (masked_ld node:$ptr, node:$pred, node:$passthru), [{ auto *Ld = cast(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4; }]>; def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), (masked_st node:$val, node:$ptr, node:$pred), [{ return cast(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), (maskedstore8 node:$val, node:$ptr, node:$pred), [{ return cast(N)->isTruncatingStore(); }]>; def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), (masked_st node:$val, node:$ptr, node:$pred), [{ auto *St = cast(N); EVT ScalarVT = St->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; }]>; def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), (maskedstore16 node:$val, node:$ptr, node:$pred), [{ return cast(N)->isTruncatingStore(); }]>; def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred), (masked_st node:$val, node:$ptr, node:$pred), [{ auto *St = cast(N); EVT ScalarVT = St->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; }]>; let Predicates = [HasMVEInt, IsLE] in { // Stores defm : MVE_vector_store; defm : MVE_vector_store; defm : MVE_vector_store; // Loads defm : MVE_vector_load; defm : MVE_vector_load; defm : MVE_vector_load; // Pre/post inc stores defm : MVE_vector_offset_store; defm : MVE_vector_offset_store; defm : MVE_vector_offset_store; defm : MVE_vector_offset_store; defm : MVE_vector_offset_store; defm : MVE_vector_offset_store; } let Predicates = [HasMVEInt, IsBE] in { // Aligned Stores def : MVE_vector_store_typed; def : MVE_vector_store_typed; def : MVE_vector_store_typed; def : MVE_vector_store_typed; def : MVE_vector_store_typed; // Aligned Loads def : MVE_vector_load_typed; def : MVE_vector_load_typed; def : MVE_vector_load_typed; def : MVE_vector_load_typed; def : MVE_vector_load_typed; // Other unaligned loads/stores need to go though a VREV def : Pat<(v2f64 (load t2addrmode_imm7<0>:$addr)), (v2f64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; def : Pat<(v2i64 (load t2addrmode_imm7<0>:$addr)), (v2i64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; def : Pat<(v4i32 (load t2addrmode_imm7<0>:$addr)), (v4i32 (MVE_VREV32_8 
(MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; def : Pat<(v4f32 (load t2addrmode_imm7<0>:$addr)), (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; def : Pat<(v8i16 (load t2addrmode_imm7<0>:$addr)), (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; def : Pat<(v8f16 (load t2addrmode_imm7<0>:$addr)), (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; def : Pat<(store (v2f64 MQPR:$val), t2addrmode_imm7<0>:$addr), (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; def : Pat<(store (v2i64 MQPR:$val), t2addrmode_imm7<0>:$addr), (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; def : Pat<(store (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr), (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; def : Pat<(store (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr), (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; def : Pat<(store (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr), (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; def : Pat<(store (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr), (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; // Pre/Post inc stores def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; def : MVE_vector_offset_store_typed; } let Predicates = [HasMVEInt] in { // Aligned masked store, shared between LE and BE def : MVE_vector_maskedstore_typed; def : MVE_vector_maskedstore_typed; def : MVE_vector_maskedstore_typed; def : MVE_vector_maskedstore_typed; def : MVE_vector_maskedstore_typed; // Truncating stores def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred), (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>; // Aligned masked loads def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; def : MVE_vector_maskedload_typed; // Extending masked loads. 
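// Sign- and zero-extending masked loads pick the S/U widening load forms;
// an any-extending masked load is simply treated like a zero-extending one
// and also selects the U form.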
  def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
                                    (v8i16 NEONimmAllZerosV))),
            (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
  def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
                                    (v4i32 NEONimmAllZerosV))),
            (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
  def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
                                    (v8i16 NEONimmAllZerosV))),
            (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
  def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
                                    (v4i32 NEONimmAllZerosV))),
            (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
  def : Pat<(v8i16 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
                                   (v8i16 NEONimmAllZerosV))),
            (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
  def : Pat<(v4i32 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
                                   (v4i32 NEONimmAllZerosV))),
            (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
  def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
                                     (v4i32 NEONimmAllZerosV))),
            (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
  def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
                                     (v4i32 NEONimmAllZerosV))),
            (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
  def : Pat<(v4i32 (extmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
                                    (v4i32 NEONimmAllZerosV))),
            (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
}

// Widening/Narrowing Loads/Stores
let MinAlignment = 2 in {
  def truncstorevi16_align2 : PatFrag<(ops node:$val, node:$ptr),
                                      (truncstorevi16 node:$val, node:$ptr)>;
  def post_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset),
                                        (post_truncstvi16 node:$val, node:$base, node:$offset)>;
  def pre_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset),
                                       (pre_truncstvi16 node:$val, node:$base, node:$offset)>;
}

let Predicates = [HasMVEInt] in {
  def : Pat<(truncstorevi8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr),
            (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr)>;
  def : Pat<(truncstorevi8 (v4i32 MQPR:$val), taddrmode_imm7<0>:$addr),
            (MVE_VSTRB32 MQPR:$val, taddrmode_imm7<0>:$addr)>;
  def : Pat<(truncstorevi16_align2 (v4i32 MQPR:$val), taddrmode_imm7<1>:$addr),
            (MVE_VSTRH32 MQPR:$val, taddrmode_imm7<1>:$addr)>;
  def : Pat<(post_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
            (MVE_VSTRB16_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
  def : Pat<(post_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
            (MVE_VSTRB32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
  def : Pat<(post_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr),
            (MVE_VSTRH32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>;
  def : Pat<(pre_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
            (MVE_VSTRB16_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
  def : Pat<(pre_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
            (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
  def : Pat<(pre_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr),
            (MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>;
}

let MinAlignment = 2 in {
  def extloadvi16_align2 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>;
  def sextloadvi16_align2 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>;
  def zextloadvi16_align2 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>;
}
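// Generate the anyext, zext and sext load patterns for one destination type
// and source element width. Any- and zero-extending loads use the unsigned
// VLDR form, sign-extending loads the signed form.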
multiclass MVEExtLoad<string DestLanes, string DestElemBits,
                      string SrcElemBits, string SrcElemType,
                      string Align, Operand am> {
  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
                    (!cast<PatFrag>("extloadvi" # SrcElemBits # Align) am:$addr)),
                 (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits)
                    am:$addr)>;
  def _Z : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
                  (!cast<PatFrag>("zextloadvi" # SrcElemBits # Align) am:$addr)),
               (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits)
                  am:$addr)>;
  def _S : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
                  (!cast<PatFrag>("sextloadvi" # SrcElemBits # Align) am:$addr)),
               (!cast<Instruction>("MVE_VLDR" # SrcElemType # "S" # DestElemBits)
                  am:$addr)>;
}

let Predicates = [HasMVEInt] in {
  defm : MVEExtLoad<"4", "32", "8", "B", "", taddrmode_imm7<0>>;
  defm : MVEExtLoad<"8", "16", "8", "B", "", taddrmode_imm7<0>>;
  defm : MVEExtLoad<"4", "32", "16", "H", "_align2", taddrmode_imm7<1>>;
}

// Bit convert patterns
let Predicates = [HasMVEInt] in {
  def : Pat<(v2f64 (bitconvert (v2i64 MQPR:$src))), (v2f64 MQPR:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 MQPR:$src))), (v2i64 MQPR:$src)>;
  def : Pat<(v4i32 (bitconvert (v4f32 MQPR:$src))), (v4i32 MQPR:$src)>;
  def : Pat<(v4f32 (bitconvert (v4i32 MQPR:$src))), (v4f32 MQPR:$src)>;
  def : Pat<(v8i16 (bitconvert (v8f16 MQPR:$src))), (v8i16 MQPR:$src)>;
  def : Pat<(v8f16 (bitconvert (v8i16 MQPR:$src))), (v8f16 MQPR:$src)>;
}

let Predicates = [IsLE,HasMVEInt] in {
  def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 MQPR:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 MQPR:$src)>;
  def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 MQPR:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 MQPR:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 MQPR:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 MQPR:$src)>;
  def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 MQPR:$src)>;
  def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 MQPR:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 MQPR:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 MQPR:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 MQPR:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 MQPR:$src)>;
  def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 MQPR:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 MQPR:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 MQPR:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 MQPR:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 MQPR:$src)>;
  def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 MQPR:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 MQPR:$src)>;
  def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 MQPR:$src)>;
  def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 MQPR:$src)>;
  def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 MQPR:$src)>;
  def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 MQPR:$src)>;
  def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 MQPR:$src)>;
  def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 MQPR:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 MQPR:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 MQPR:$src)>;
  def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 MQPR:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 MQPR:$src)>;
  def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 MQPR:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 MQPR:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 MQPR:$src)>;
  def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 MQPR:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 MQPR:$src)>;
  def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 MQPR:$src)>;
  def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 MQPR:$src)>;
}
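// In big endian mode the lanes have to be reordered with a VREV so that a
// bitconvert between differently sized lane types keeps the same in-memory
// layout.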
let Predicates = [IsBE,HasMVEInt] in {
  def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
  def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
  def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
  def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
  def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 (MVE_VREV64_8 MQPR:$src))>;
  def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
  def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
  def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
  def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
  def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 (MVE_VREV64_8 MQPR:$src))>;
  def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
  def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
  def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
  def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
  def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 (MVE_VREV32_8 MQPR:$src))>;
  def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
  def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
  def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
  def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
  def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 (MVE_VREV32_8 MQPR:$src))>;
  def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
  def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
  def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
  def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
  def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 (MVE_VREV16_8 MQPR:$src))>;
  def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
  def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
  def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
  def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
  def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 (MVE_VREV16_8 MQPR:$src))>;
  def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
  def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
  def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
  def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
  def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
  def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll
new file mode 100644
index 000000000000..a8036a3ce3a9
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+%struct.float16x8x2_t = type { [2 x <8 x half>] }
+%struct.uint8x16x4_t = type { [4 x <16 x i8>] }
+%struct.uint32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int8x16x4_t = type { [4 x <16 x i8>] }
+
+define arm_aapcs_vfpcc %struct.float16x8x2_t @test_vld2q_f16(half* %addr) {
+; CHECK-LABEL: test_vld2q_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
+; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0f16(half* %addr)
+  %1 = extractvalue { <8 x half>, <8 x half> } %0, 0
+  %2 = insertvalue %struct.float16x8x2_t undef, <8 x half> %1, 0, 0
+  %3 = extractvalue { <8 x half>, <8 x half> } %0, 1
+  %4 = insertvalue %struct.float16x8x2_t %2, <8 x half> %3, 0, 1
+  ret %struct.float16x8x2_t %4
+}
+
+declare { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0f16(half*)
+
+define arm_aapcs_vfpcc %struct.uint8x16x4_t @test_vld4q_u8(i8* %addr) {
+; CHECK-LABEL: test_vld4q_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vld40.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld41.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld42.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld43.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.mve.vld4q.v16i8.p0i8(i8* %addr)
+  %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %0, 0
+  %2 = insertvalue %struct.uint8x16x4_t undef, <16 x i8> %1, 0, 0
+  %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %0, 1
+  %4 = insertvalue %struct.uint8x16x4_t %2, <16 x i8> %3, 0, 1
+  %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %0, 2
+  %6 = insertvalue %struct.uint8x16x4_t %4, <16 x i8> %5, 0, 2
+  %7 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %0, 3
+  %8 = insertvalue %struct.uint8x16x4_t %6, <16 x i8> %7, 0, 3
+  ret %struct.uint8x16x4_t %8
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.mve.vld4q.v16i8.p0i8(i8*)
+
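+; The store tests below call the vst2q/vst4q intrinsics once per stage; the
+; trailing i32 operand selects the stage, which becomes the corresponding
+; vst20/vst21 (or vst40-vst43) instruction on the same base address.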
+define arm_aapcs_vfpcc void @test_vst2q_u32(i32* %addr, %struct.uint32x4x2_t %value.coerce) {
+; CHECK-LABEL: test_vst2q_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
+; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %value.coerce.fca.0.0.extract = extractvalue %struct.uint32x4x2_t %value.coerce, 0, 0
+  %value.coerce.fca.0.1.extract = extractvalue %struct.uint32x4x2_t %value.coerce, 0, 1
+  tail call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* %addr, <4 x i32> %value.coerce.fca.0.0.extract, <4 x i32> %value.coerce.fca.0.1.extract, i32 0)
+  tail call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* %addr, <4 x i32> %value.coerce.fca.0.0.extract, <4 x i32> %value.coerce.fca.0.1.extract, i32 1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vst2q.p0i32.v4i32(i32*, <4 x i32>, <4 x i32>, i32)
+
+define arm_aapcs_vfpcc void @test_vst2q_f16(half* %addr, %struct.float16x8x2_t %value.coerce) {
+; CHECK-LABEL: test_vst2q_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    vst20.16 {q0, q1}, [r0]
+; CHECK-NEXT:    vst21.16 {q0, q1}, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %value.coerce.fca.0.0.extract = extractvalue %struct.float16x8x2_t %value.coerce, 0, 0
+  %value.coerce.fca.0.1.extract = extractvalue %struct.float16x8x2_t %value.coerce, 0, 1
+  call void @llvm.arm.mve.vst2q.p0f16.v8f16(half* %addr, <8 x half> %value.coerce.fca.0.0.extract, <8 x half> %value.coerce.fca.0.1.extract, i32 0)
+  call void @llvm.arm.mve.vst2q.p0f16.v8f16(half* %addr, <8 x half> %value.coerce.fca.0.0.extract, <8 x half> %value.coerce.fca.0.1.extract, i32 1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vst2q.p0f16.v8f16(half*, <8 x half>, <8 x half>, i32)
+
+define arm_aapcs_vfpcc void @test_vst4q_s8(i8* %addr, %struct.int8x16x4_t %value.coerce) {
+; CHECK-LABEL: test_vst4q_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    vst40.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vst41.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vst42.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vst43.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %value.coerce.fca.0.0.extract = extractvalue %struct.int8x16x4_t %value.coerce, 0, 0
+  %value.coerce.fca.0.1.extract = extractvalue %struct.int8x16x4_t %value.coerce, 0, 1
+  %value.coerce.fca.0.2.extract = extractvalue %struct.int8x16x4_t %value.coerce, 0, 2
+  %value.coerce.fca.0.3.extract = extractvalue %struct.int8x16x4_t %value.coerce, 0, 3
+  tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* %addr, <16 x i8> %value.coerce.fca.0.0.extract, <16 x i8> %value.coerce.fca.0.1.extract, <16 x i8> %value.coerce.fca.0.2.extract, <16 x i8> %value.coerce.fca.0.3.extract, i32 0)
+  tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* %addr, <16 x i8> %value.coerce.fca.0.0.extract, <16 x i8> %value.coerce.fca.0.1.extract, <16 x i8> %value.coerce.fca.0.2.extract, <16 x i8> %value.coerce.fca.0.3.extract, i32 1)
+  tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* %addr, <16 x i8> %value.coerce.fca.0.0.extract, <16 x i8> %value.coerce.fca.0.1.extract, <16 x i8> %value.coerce.fca.0.2.extract, <16 x i8> %value.coerce.fca.0.3.extract, i32 2)
+  tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* %addr, <16 x i8> %value.coerce.fca.0.0.extract, <16 x i8> %value.coerce.fca.0.1.extract, <16 x i8> %value.coerce.fca.0.2.extract, <16 x i8> %value.coerce.fca.0.3.extract, i32 3)
+  ret void
+}
+
+declare void @llvm.arm.mve.vst4q.p0i8.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)