diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -1407,6 +1407,13 @@
Expr *SizeExpr, SourceLocation AttrLoc) const;
+ /// Return the unique reference to a vector type of the specified
+ /// element type whose total width matches the target register size.
+ ///
+ /// \pre \p VectorType must be a built-in type.
+ QualType getRegisterSizedVectorType(QualType VectorType,
+ VectorType::VectorKind VecKind) const;
+
/// Return the unique reference to the matrix type of the specified element
/// type and size
///
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -38,6 +38,8 @@
// V -> Vector, followed by the number of elements and the base type.
// q -> Scalable vector, followed by the number of elements and the base type.
// E -> ext_vector, followed by the number of elements and the base type.
+// e -> Register-sized vector, followed by the base type; the total
+//      vector width matches the register size.
// X -> _Complex, followed by the base type.
// Y -> ptrdiff_t
// P -> FILE
diff --git a/clang/include/clang/Basic/BuiltinsRISCV.def b/clang/include/clang/Basic/BuiltinsRISCV.def
--- a/clang/include/clang/Basic/BuiltinsRISCV.def
+++ b/clang/include/clang/Basic/BuiltinsRISCV.def
@@ -17,5 +17,699 @@
#include "clang/Basic/riscv_vector_builtins.inc"
+// P extension
+
+// add8
+TARGET_BUILTIN(__rv__add8, "ULiULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_uadd8, "eUceUceUc", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_sadd8, "eSceSceSc", "", "experimental-p")
+
+// add16
+TARGET_BUILTIN(__rv__add16, "ULiULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_uadd16, "eUseUseUs", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_sadd16, "eSseSseSs", "", "experimental-p")
+
+// ave
+TARGET_BUILTIN(__rv__ave, "SLiSLiSLi", "", "experimental-p")
+
+// bitrev
+TARGET_BUILTIN(__rv__bitrev, "ULiULiULi", "", "experimental-p")
+
+// bpick
+TARGET_BUILTIN(__rv__bpick, "ULiULiULiULi", "", "experimental-p")
+
+// clrs8
+TARGET_BUILTIN(__rv__clrs8, "ULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_clrs8, "eUceSc", "", "experimental-p")
+
+// clrs16
+TARGET_BUILTIN(__rv__clrs16, "ULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_clrs16, "eUseSs", "", "experimental-p")
+
+// clrs32
+TARGET_BUILTIN(__rv__clrs32, "ULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_clrs32, "V2UiV2Si", "", "experimental-p")
+
+// clo8
+TARGET_BUILTIN(__rv__clo8, "ULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_clo8, "eUceSc", "", "experimental-p")
+
+// clo16
+TARGET_BUILTIN(__rv__clo16, "ULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_clo16, "eUseSs", "", "experimental-p")
+
+// clo32
+TARGET_BUILTIN(__rv__clo32, "ULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_clo32, "V2UiV2Si", "", "experimental-p")
+
+// clz8
+TARGET_BUILTIN(__rv__clz8, "ULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_clz8, "eUceSc", "", "experimental-p")
+
+// clz16
+TARGET_BUILTIN(__rv__clz16, "ULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_clz16, "eUseSs", "", "experimental-p")
+
+// clz32
+TARGET_BUILTIN(__rv__clz32, "ULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_clz32, "V2UiV2Si", "", "experimental-p")
+
+// cmpeq8
+TARGET_BUILTIN(__rv__cmpeq8, "ULiULiULi", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_ucmpeq8, "eUceUceUc", "", "experimental-p")
+TARGET_BUILTIN(__rv__v_scmpeq8, "eUceSceSc", "", "experimental-p") + +// cmpeq16 +TARGET_BUILTIN(__rv__cmpeq16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ucmpeq16, "eUseUseUs", "", "experimental-p") +TARGET_BUILTIN(__rv__v_scmpeq16, "eUseSseSs", "", "experimental-p") + +// cras16 +TARGET_BUILTIN(__rv__cras16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ucras16, "eUseUseUs", "", "experimental-p") +TARGET_BUILTIN(__rv__v_scras16, "eSseSseSs", "", "experimental-p") + +// crsa16 +TARGET_BUILTIN(__rv__crsa16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ucrsa16, "eUseUseUs", "", "experimental-p") +TARGET_BUILTIN(__rv__v_scrsa16, "eSseSseSs", "", "experimental-p") + +// insb +TARGET_BUILTIN(__rv__insb, "ULiULiULiULi", "", "experimental-p") + +// kabs8 +TARGET_BUILTIN(__rv__kabs8, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kabs8, "eSceSc", "", "experimental-p") + +// kabs16 +TARGET_BUILTIN(__rv__kabs16, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kabs16, "eSseSs", "", "experimental-p") + +// kabsw +TARGET_BUILTIN(__rv__kabsw, "ULiSLi", "", "experimental-p") + +// kadd8 +TARGET_BUILTIN(__rv__kadd8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kadd8, "eSceSceSc", "", "experimental-p") + +// kadd16 +TARGET_BUILTIN(__rv__kadd16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kadd16, "eSseSseSs", "", "experimental-p") + +// kaddh +TARGET_BUILTIN(__rv__kaddh, "LiLiLi", "", "experimental-p") + +// kaddw +TARGET_BUILTIN(__rv__kaddw, "LiLiLi", "", "experimental-p") + +// kcras16 +TARGET_BUILTIN(__rv__kcras16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kcras16, "eSseSseSs", "", "experimental-p") + +// kcrsa16 +TARGET_BUILTIN(__rv__kcrsa16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kcrsa16, "eSseSseSs", "", "experimental-p") + +// kdmbb +TARGET_BUILTIN(__rv__kdmbb, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kdmbb, "LieSseSs", "", "experimental-p") +// kdmbt +TARGET_BUILTIN(__rv__kdmbt, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kdmbt, "LieSseSs", "", "experimental-p") +// kdmtt +TARGET_BUILTIN(__rv__kdmtt, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kdmtt, "LieSseSs", "", "experimental-p") + +// kdmabb +TARGET_BUILTIN(__rv__kdmabb, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kdmabb, "LiLieSseSs", "", "experimental-p") +// kdmabt +TARGET_BUILTIN(__rv__kdmabt, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kdmabt, "LiLieSseSs", "", "experimental-p") +// kdmatt +TARGET_BUILTIN(__rv__kdmatt, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kdmatt, "LiLieSseSs", "", "experimental-p") + +// khm8 +TARGET_BUILTIN(__rv__khm8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_khm8, "eSceSceSc", "", "experimental-p") +// khmx8 +TARGET_BUILTIN(__rv__khmx8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_khmx8, "eSceSceSc", "", "experimental-p") + +// khm16 +TARGET_BUILTIN(__rv__khm16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_khm16, "eSseSseSs", "", "experimental-p") +// khmx16 +TARGET_BUILTIN(__rv__khmx16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_khmx16, "eSseSseSs", "", "experimental-p") + +// khmbb +TARGET_BUILTIN(__rv__khmbb, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_khmbb, "LieSseSs", "", "experimental-p") +// khmbt +TARGET_BUILTIN(__rv__khmbt, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_khmbt, "LieSseSs", "", 
"experimental-p") +// khmtt +TARGET_BUILTIN(__rv__khmtt, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_khmtt, "LieSseSs", "", "experimental-p") + +// kmabb +TARGET_BUILTIN(__rv__kmabb, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmabb, "eSieSieSseSs", "", "experimental-p") +// kmabt +TARGET_BUILTIN(__rv__kmabt, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmabt, "eSieSieSseSs", "", "experimental-p") +// kmatt +TARGET_BUILTIN(__rv__kmatt, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmatt, "eSieSieSseSs", "", "experimental-p") + +// kmada +TARGET_BUILTIN(__rv__kmada, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmada, "eSieSieSseSs", "", "experimental-p") +// kmaxda +TARGET_BUILTIN(__rv__kmaxda, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmaxda, "eSieSieSseSs", "", "experimental-p") + +// kmads +TARGET_BUILTIN(__rv__kmads, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmads, "eSieSieSseSs", "", "experimental-p") +// kmadrs +TARGET_BUILTIN(__rv__kmadrs, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmadrs, "eSieSieSseSs", "", "experimental-p") +// kmaxds +TARGET_BUILTIN(__rv__kmaxds, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmaxds, "eSieSieSseSs", "", "experimental-p") + +// kmda +TARGET_BUILTIN(__rv__kmda, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmda, "eSieSseSs", "", "experimental-p") +// kmxda +TARGET_BUILTIN(__rv__kmxda, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmxda, "eSieSseSs", "", "experimental-p") + +// kmmac +TARGET_BUILTIN(__rv__kmmac, "LiLiLiLi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmac, "V2SiV2SiV2SiV2Si", "", "experimental-p") +// kmmac.u +TARGET_BUILTIN(__rv__kmmac_u, "LiLiLiLi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmac_u, "V2SiV2SiV2SiV2Si", "", "experimental-p") + +// kmmawb +TARGET_BUILTIN(__rv__kmmawb, "LiLiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmawb, "eSieSieSieSs", "", "experimental-p") +// kmmawb_u +TARGET_BUILTIN(__rv__kmmawb_u, "LiLiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmawb_u, "eSieSieSieSs", "", "experimental-p") + +// kmmawb2 +TARGET_BUILTIN(__rv__kmmawb2, "LiLiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmawb2, "eSieSieSieSs", "", "experimental-p") +// kmmawb2_u +TARGET_BUILTIN(__rv__kmmawb2_u, "LiLiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmawb2_u, "eSieSieSieSs", "", "experimental-p") + +// kmmawt +TARGET_BUILTIN(__rv__kmmawt, "LiLiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmawt, "eSieSieSieSs", "", "experimental-p") +// kmmawt_u +TARGET_BUILTIN(__rv__kmmawt_u, "LiLiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmawt_u, "eSieSieSieSs", "", "experimental-p") + +// kmmawt2 +TARGET_BUILTIN(__rv__kmmawt2, "LiLiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmawt2, "eSieSieSieSs", "", "experimental-p") +// kmmawt2_u +TARGET_BUILTIN(__rv__kmmawt2_u, "LiLiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmawt2_u, "eSieSieSieSs", "", "experimental-p") + +// kmmsb +TARGET_BUILTIN(__rv__kmmsb, "LiLiLiLi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmsb, "V2SiV2SiV2SiV2Si", "", "experimental-p") +// kmmsb.u +TARGET_BUILTIN(__rv__kmmsb_u, "LiLiLiLi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmsb_u, "V2SiV2SiV2SiV2Si", "", "experimental-p") + +// kmmwb2 +TARGET_BUILTIN(__rv__kmmwb2, "LiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmwb2, "eSieSieSs", "", "experimental-p") 
+// kmmwb2_u +TARGET_BUILTIN(__rv__kmmwb2_u, "LiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmwb2_u, "eSieSieSs", "", "experimental-p") + +// kmmwt2 +TARGET_BUILTIN(__rv__kmmwt2, "LiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmwt2, "eSieSieSs", "", "experimental-p") +// kmmwt2_u +TARGET_BUILTIN(__rv__kmmwt2_u, "LiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmmwt2_u, "eSieSieSs", "", "experimental-p") + +// kmsda +TARGET_BUILTIN(__rv__kmsda, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmsda, "eSieSieSseSs", "", "experimental-p") +// kmsxda +TARGET_BUILTIN(__rv__kmsxda, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kmsxda, "eSieSieSseSs", "", "experimental-p") + +// ksllw +TARGET_BUILTIN(__rv__ksllw, "LiLiULi", "", "experimental-p") + +// ksll8 +TARGET_BUILTIN(__rv__ksll8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ksll8, "eSceScULi", "", "experimental-p") + +// ksll16 +TARGET_BUILTIN(__rv__ksll16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ksll16, "eSseSsULi", "", "experimental-p") + +// kslra8 +TARGET_BUILTIN(__rv__kslra8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kslra8, "eSceScULi", "", "experimental-p") +// kslra8_u +TARGET_BUILTIN(__rv__kslra8_u, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kslra8_u, "eSceScULi", "", "experimental-p") + +// kslra16 +TARGET_BUILTIN(__rv__kslra16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kslra16, "eSseSsULi", "", "experimental-p") +// kslra16_u +TARGET_BUILTIN(__rv__kslra16_u, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kslra16_u, "eSseSsULi", "", "experimental-p") + +// kstas16 +TARGET_BUILTIN(__rv__kstas16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kstas16, "eSseSseSs", "", "experimental-p") + +// kstsa16 +TARGET_BUILTIN(__rv__kstsa16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kstsa16, "eSseSseSs", "", "experimental-p") + +// ksub8 +TARGET_BUILTIN(__rv__ksub8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ksub8, "eSceSceSc", "", "experimental-p") + +// ksub16 +TARGET_BUILTIN(__rv__ksub16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ksub16, "eSseSseSs", "", "experimental-p") + +// ksubh +TARGET_BUILTIN(__rv__ksubh, "LiLiLi", "", "experimental-p") + +// ksubw +TARGET_BUILTIN(__rv__ksubw, "LiLiLi", "", "experimental-p") + +// kwmmul +TARGET_BUILTIN(__rv__kwmmul, "LiLiLi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kwmmul, "V2SiV2SiV2Si", "", "experimental-p") +// kwmmul_u +TARGET_BUILTIN(__rv__kwmmul_u, "LiLiLi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_kwmmul_u, "V2SiV2SiV2Si", "", "experimental-p") + +// maxw +TARGET_BUILTIN(__rv__maxw, "LiLiLi", "", "experimental-p") + +// minw +TARGET_BUILTIN(__rv__minw, "LiLiLi", "", "experimental-p") + +// pbsad +TARGET_BUILTIN(__rv__pbsad, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_pbsad, "ULieUceUc", "", "experimental-p") + +// pbsada +TARGET_BUILTIN(__rv__pbsada, "ULiULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_pbsada, "ULiULieUceUc", "", "experimental-p") + +// pkbb16 +TARGET_BUILTIN(__rv__pkbb16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_pkbb16, "eUseUseUs", "", "experimental-p") +// pkbt16 +TARGET_BUILTIN(__rv__pkbt16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_pkbt16, "eUseUseUs", "", "experimental-p") +// pktt16 +TARGET_BUILTIN(__rv__pktt16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_pktt16, "eUseUseUs", 
"", "experimental-p") +// pktb16 +TARGET_BUILTIN(__rv__pktb16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_pktb16, "eUseUseUs", "", "experimental-p") + +// radd8 +TARGET_BUILTIN(__rv__radd8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_radd8, "eSceSceSc", "", "experimental-p") + +// radd16 +TARGET_BUILTIN(__rv__radd16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_radd16, "eSseSseSs", "", "experimental-p") + +// raddw +TARGET_BUILTIN(__rv__raddw, "LiLiLi", "", "experimental-p") + +// rcras16 +TARGET_BUILTIN(__rv__rcras16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_rcras16, "eSseSseSs", "", "experimental-p") + +// rcrsa16 +TARGET_BUILTIN(__rv__rcrsa16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_rcrsa16, "eSseSseSs", "", "experimental-p") + +// rstas16 +TARGET_BUILTIN(__rv__rstas16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_rstas16, "eSseSseSs", "", "experimental-p") + +// rstsa16 +TARGET_BUILTIN(__rv__rstsa16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_rstsa16, "eSseSseSs", "", "experimental-p") + +// rsub8 +TARGET_BUILTIN(__rv__rsub8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_rsub8, "eSceSceSc", "", "experimental-p") + +// rsub16 +TARGET_BUILTIN(__rv__rsub16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_rsub16, "eSseSseSs", "", "experimental-p") + +// rsubw +TARGET_BUILTIN(__rv__rsubw, "LiLiLi", "", "experimental-p") + +// sclip8 +TARGET_BUILTIN(__rv__sclip8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sclip8, "eSceScULi", "", "experimental-p") + +// sclip16 +TARGET_BUILTIN(__rv__sclip16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sclip16, "eSseSsULi", "", "experimental-p") + +// sclip32 +TARGET_BUILTIN(__rv__sclip32, "LiLiLi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sclip32, "V2SiV2SiULi", "", "experimental-p") + +// scmple8 +TARGET_BUILTIN(__rv__scmple8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_scmple8, "eUceSceSc", "", "experimental-p") + +// scmple16 +TARGET_BUILTIN(__rv__scmple16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_scmple16, "eUseSseSs", "", "experimental-p") + +// scmplt8 +TARGET_BUILTIN(__rv__scmplt8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_scmplt8, "eUceSceSc", "", "experimental-p") + +// scmplt16 +TARGET_BUILTIN(__rv__scmplt16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_scmplt16, "eUseSseSs", "", "experimental-p") + +// sll8 +TARGET_BUILTIN(__rv__sll8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sll8, "eUceUcULi", "", "experimental-p") + +// sll16 +TARGET_BUILTIN(__rv__sll16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sll16, "eUseUsULi", "", "experimental-p") + +// smaqa +TARGET_BUILTIN(__rv__smaqa, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smaqa, "eSieSieSceSc", "", "experimental-p") +// smaqa_su +TARGET_BUILTIN(__rv__smaqa_su, "LiLiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smaqa_su, "eSieSieSceSc", "", "experimental-p") + +// smax8 +TARGET_BUILTIN(__rv__smax8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smax8, "eSceSceSc", "", "experimental-p") + +// smax16 +TARGET_BUILTIN(__rv__smax16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smax16, "eSseSseSs", "", "experimental-p") + +// smbb16 +TARGET_BUILTIN(__rv__smbb16, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smbb16, "eSieSseSs", "", "experimental-p") +// smbt16 
+TARGET_BUILTIN(__rv__smbt16, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smbt16, "eSieSseSs", "", "experimental-p") +// smtt16 +TARGET_BUILTIN(__rv__smtt16, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smtt16, "eSieSseSs", "", "experimental-p") + +// smds +TARGET_BUILTIN(__rv__smds, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smds, "eSieSseSs", "", "experimental-p") +// smdrs +TARGET_BUILTIN(__rv__smdrs, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smdrs, "eSieSseSs", "", "experimental-p") +// smxds +TARGET_BUILTIN(__rv__smxds, "LiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smxds, "eSieSseSs", "", "experimental-p") + +// smin8 +TARGET_BUILTIN(__rv__smin8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smin8, "eSceSceSc", "", "experimental-p") + +// smin16 +TARGET_BUILTIN(__rv__smin16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smin16, "eSseSseSs", "", "experimental-p") + +// smmul +TARGET_BUILTIN(__rv__smmul, "LiLiLi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smmul, "V2SiV2SiV2Si", "", "experimental-p") +// smmul_u +TARGET_BUILTIN(__rv__smmul_u, "LiLiLi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smmul_u, "V2SiV2SiV2Si", "", "experimental-p") + +// smmwb +TARGET_BUILTIN(__rv__smmwb, "LiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smmwb, "eSieSieSs", "", "experimental-p") +// smmwb_u +TARGET_BUILTIN(__rv__smmwb_u, "LiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smmwb_u, "eSieSieSs", "", "experimental-p") + +// smmwt +TARGET_BUILTIN(__rv__smmwt, "LiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smmwt, "eSieSieSs", "", "experimental-p") +// smmwt_u +TARGET_BUILTIN(__rv__smmwt_u, "LiLiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_smmwt_u, "eSieSieSs", "", "experimental-p") + +// sra_u +TARGET_BUILTIN(__rv__sra_u, "LiLiULi", "", "experimental-p") + +// sra8 +TARGET_BUILTIN(__rv__sra8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sra8, "eSceScULi", "", "experimental-p") +// sra8_u +TARGET_BUILTIN(__rv__sra8_u, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sra8_u, "eSceScULi", "", "experimental-p") + +// sra16 +TARGET_BUILTIN(__rv__sra16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sra16, "eSseSsULi", "", "experimental-p") +// sra16_u +TARGET_BUILTIN(__rv__sra16_u, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sra16_u, "eSseSsULi", "", "experimental-p") + +// srl8 +TARGET_BUILTIN(__rv__srl8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_srl8, "eUceUcULi", "", "experimental-p") +// srl8_u +TARGET_BUILTIN(__rv__srl8_u, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_srl8_u, "eUceUcULi", "", "experimental-p") + +// srl16 +TARGET_BUILTIN(__rv__srl16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_srl16, "eUseUsULi", "", "experimental-p") +// srl16_u +TARGET_BUILTIN(__rv__srl16_u, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_srl16_u, "eUseUsULi", "", "experimental-p") + +// stas16 +TARGET_BUILTIN(__rv__stas16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ustas16, "eUseUseUs", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sstas16, "eSseSseSs", "", "experimental-p") + +// stsa16 +TARGET_BUILTIN(__rv__stsa16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ustsa16, "eUseUseUs", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sstsa16, "eSseSseSs", "", "experimental-p") + +// sub8 +TARGET_BUILTIN(__rv__sub8, "ULiULiULi", "", "experimental-p") 
+TARGET_BUILTIN(__rv__v_usub8, "eUceUceUc", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ssub8, "eSceSceSc", "", "experimental-p") + +// sub16 +TARGET_BUILTIN(__rv__sub16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_usub16, "eUseUseUs", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ssub16, "eSseSseSs", "", "experimental-p") + +// sunpkd810 +TARGET_BUILTIN(__rv__sunpkd810, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sunpkd810, "eSseSc", "", "experimental-p") +// sunpkd820 +TARGET_BUILTIN(__rv__sunpkd820, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sunpkd820, "eSseSc", "", "experimental-p") +// sunpkd830 +TARGET_BUILTIN(__rv__sunpkd830, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sunpkd830, "eSseSc", "", "experimental-p") +// sunpkd831 +TARGET_BUILTIN(__rv__sunpkd831, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sunpkd831, "eSseSc", "", "experimental-p") +// sunpkd832 +TARGET_BUILTIN(__rv__sunpkd832, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_sunpkd832, "eSseSc", "", "experimental-p") + +// swap8 +TARGET_BUILTIN(__rv__swap8, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_swap8, "eUceSc", "", "experimental-p") + +// swap16 +TARGET_BUILTIN(__rv__swap16, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_swap16, "eUseSs", "", "experimental-p") + +// uclip8 +TARGET_BUILTIN(__rv__uclip8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_uclip8, "eSceScULi", "", "experimental-p") + +// uclip16 +TARGET_BUILTIN(__rv__uclip16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_uclip16, "eSseSsULi", "", "experimental-p") + +// uclip32 +TARGET_BUILTIN(__rv__uclip32, "LiLiLi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_uclip32, "V2SiV2SiULi", "", "experimental-p") + +// ucmple8 +TARGET_BUILTIN(__rv__ucmple8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ucmple8, "eUceSceSc", "", "experimental-p") + +// ucmple16 +TARGET_BUILTIN(__rv__ucmple16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ucmple16, "eUseSseSs", "", "experimental-p") + +// ucmplt8 +TARGET_BUILTIN(__rv__ucmplt8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ucmplt8, "eUceSceSc", "", "experimental-p") + +// ucmplt16 +TARGET_BUILTIN(__rv__ucmplt16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ucmplt16, "eUseSseSs", "", "experimental-p") + +// ukadd8 +TARGET_BUILTIN(__rv__ukadd8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ukadd8, "eUceUceUc", "", "experimental-p") + +// ukadd16 +TARGET_BUILTIN(__rv__ukadd16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ukadd16, "eUseUseUs", "", "experimental-p") + +// ukaddh +TARGET_BUILTIN(__rv__ukaddh, "ULiULiULi", "", "experimental-p") + +// ukaddw +TARGET_BUILTIN(__rv__ukaddw, "ULiULiULi", "", "experimental-p") + +// ukcras16 +TARGET_BUILTIN(__rv__ukcras16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ukcras16, "eUseUseUs", "", "experimental-p") + +// ukcrsa16 +TARGET_BUILTIN(__rv__ukcrsa16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ukcrsa16, "eUseUseUs", "", "experimental-p") + +// ukstas16 +TARGET_BUILTIN(__rv__ukstas16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ukstas16, "eSseSseSs", "", "experimental-p") + +// ukstsa16 +TARGET_BUILTIN(__rv__ukstsa16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ukstsa16, "eSseSseSs", "", "experimental-p") + +// uksub8 +TARGET_BUILTIN(__rv__uksub8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_uksub8, 
"eSceSceSc", "", "experimental-p") + +// uksub16 +TARGET_BUILTIN(__rv__uksub16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_uksub16, "eSseSseSs", "", "experimental-p") + +// uksubh +TARGET_BUILTIN(__rv__uksubh, "LiLiLi", "", "experimental-p") + +// uksubw +TARGET_BUILTIN(__rv__uksubw, "LiLiLi", "", "experimental-p") + +// umaqa +TARGET_BUILTIN(__rv__umaqa, "ULiULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_umaqa, "eUieUieUceUc", "", "experimental-p") + +// umax8 +TARGET_BUILTIN(__rv__umax8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_umax8, "eSceSceSc", "", "experimental-p") + +// umax16 +TARGET_BUILTIN(__rv__umax16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_umax16, "eSseSseSs", "", "experimental-p") + +// umin8 +TARGET_BUILTIN(__rv__umin8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_umin8, "eSceSceSc", "", "experimental-p") + +// umin16 +TARGET_BUILTIN(__rv__umin16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_umin16, "eSseSseSs", "", "experimental-p") + +// uradd8 +TARGET_BUILTIN(__rv__uradd8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_uradd8, "eSceSceSc", "", "experimental-p") + +// uradd16 +TARGET_BUILTIN(__rv__uradd16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_uradd16, "eSseSseSs", "", "experimental-p") + +// uraddw +TARGET_BUILTIN(__rv__uraddw, "LiLiLi", "", "experimental-p") + +// urcras16 +TARGET_BUILTIN(__rv__urcras16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_urcras16, "eSseSseSs", "", "experimental-p") + +// urcrsa16 +TARGET_BUILTIN(__rv__urcrsa16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_urcrsa16, "eSseSseSs", "", "experimental-p") + +// urstas16 +TARGET_BUILTIN(__rv__urstas16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_urstas16, "eSseSseSs", "", "experimental-p") + +// urstsa16 +TARGET_BUILTIN(__rv__urstsa16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_urstsa16, "eSseSseSs", "", "experimental-p") + +// ursub8 +TARGET_BUILTIN(__rv__ursub8, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ursub8, "eSceSceSc", "", "experimental-p") + +// ursub16 +TARGET_BUILTIN(__rv__ursub16, "ULiULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_ursub16, "eSseSseSs", "", "experimental-p") + +// ursubw +TARGET_BUILTIN(__rv__ursubw, "LiLiLi", "", "experimental-p") + +// zunpkd810 +TARGET_BUILTIN(__rv__zunpkd810, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_zunpkd810, "eSseSc", "", "experimental-p") +// zunpkd820 +TARGET_BUILTIN(__rv__zunpkd820, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_zunpkd820, "eSseSc", "", "experimental-p") +// zunpkd830 +TARGET_BUILTIN(__rv__zunpkd830, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_zunpkd830, "eSseSc", "", "experimental-p") +// zunpkd831 +TARGET_BUILTIN(__rv__zunpkd831, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_zunpkd831, "eSseSc", "", "experimental-p") +// zunpkd832 +TARGET_BUILTIN(__rv__zunpkd832, "ULiULi", "", "experimental-p") +TARGET_BUILTIN(__rv__v_zunpkd832, "eSseSc", "", "experimental-p") + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11170,4 +11170,7 @@ // RISC-V V-extension def err_riscvv_builtin_requires_v : Error< "builtin requires 'V' extension support to be enabled">; +// RISC-V 
P-extension +def err_riscvv_builtin_requires_p : Error< + "builtin requires 'P' extension support to be enabled">; } // end of sema component. diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -4030,6 +4030,44 @@ return QualType(New, 0); } +/// getRegisterSizedVectorType - Return the unique reference to a vector type of +/// the specified element type and its width is the same as the register size. +/// VectorType must be a built-in type. +QualType +ASTContext::getRegisterSizedVectorType(QualType vecType, + VectorType::VectorKind VecKind) const { + assert(vecType->isBuiltinType()); + + unsigned NumElts = getTargetInfo().getRegisterWidth() / getTypeSize(vecType); + + if (NumElts <= 1) + return vecType; + + // Check if we've already instantiated a vector of this type. + llvm::FoldingSetNodeID ID; + VectorType::Profile(ID, vecType, NumElts, Type::Vector, VecKind); + + void *InsertPos = nullptr; + if (VectorType *VTP = VectorTypes.FindNodeOrInsertPos(ID, InsertPos)) + return QualType(VTP, 0); + + // If the element type isn't canonical, this won't be a canonical type either, + // so fill in the canonical type field. + QualType Canonical; + if (!vecType.isCanonical()) { + Canonical = getVectorType(getCanonicalType(vecType), NumElts, VecKind); + + // Get the new insert position for the node we care about. + VectorType *NewIP = VectorTypes.FindNodeOrInsertPos(ID, InsertPos); + assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP; + } + auto *New = new (*this, TypeAlignment) + VectorType(vecType, NumElts, Canonical, VecKind); + VectorTypes.InsertNode(New, InsertPos); + Types.push_back(New); + return QualType(New, 0); +} + QualType ASTContext::getConstantMatrixType(QualType ElementTy, unsigned NumRows, unsigned NumColumns) const { llvm::FoldingSetNodeID ID; @@ -10404,6 +10442,15 @@ Type = Context.getExtVectorType(ElementType, NumElements); break; } + case 'e': { + QualType ElementType = DecodeTypeFromStr(Str, Context, Error, + RequiresICE, false); + assert(!RequiresICE && "Can't require vector ICE"); + + Type = Context.getRegisterSizedVectorType(ElementType, + VectorType::GenericVector); + break; + } case 'X': { QualType ElementType = DecodeTypeFromStr(Str, Context, Error, RequiresICE, false); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17858,6 +17858,262 @@ llvm::SmallVector IntrinsicTypes; switch (BuiltinID) { #include "clang/Basic/riscv_vector_builtin_cg.inc" + + // P extension +#define EMIT_BUILTIN(NAME, INT) \ + case RISCV::BI__rv__##NAME: \ + ID = Intrinsic::riscv_##INT; \ + IntrinsicTypes = { Ops[0]->getType() }; \ + break; + +#define BUILTIN(NAME) \ + EMIT_BUILTIN(NAME, NAME) \ + +#define BUILTIN_WITH_V(NAME) \ + EMIT_BUILTIN(NAME, NAME) \ + EMIT_BUILTIN(v_##NAME, v_##NAME) + +#define BUILTIN_WITH_US_V(NAME) \ + EMIT_BUILTIN(NAME, NAME) \ + EMIT_BUILTIN(v_u##NAME, v_##NAME) \ + EMIT_BUILTIN(v_s##NAME, v_##NAME) + + BUILTIN(kabsw) + BUILTIN(ave) + BUILTIN(bitrev) + BUILTIN(kaddh) + BUILTIN(kaddw) + BUILTIN(ksllw) + BUILTIN(ksubh) + BUILTIN(ksubw) + BUILTIN(maxw) + BUILTIN(minw) + BUILTIN(raddw) + BUILTIN(rsubw) + BUILTIN(sra_u) + BUILTIN(ukaddh) + BUILTIN(ukaddw) + BUILTIN(uksubh) + BUILTIN(uksubw) + BUILTIN(uraddw) + BUILTIN(ursubw) + BUILTIN(bpick) + BUILTIN(insb) + + BUILTIN_WITH_V(clrs8) + BUILTIN_WITH_V(clrs16) + BUILTIN_WITH_V(clrs32) + BUILTIN_WITH_V(clo8) + 
BUILTIN_WITH_V(clo16) + BUILTIN_WITH_V(clo32) + BUILTIN_WITH_V(clz8) + BUILTIN_WITH_V(clz16) + BUILTIN_WITH_V(clz32) + BUILTIN_WITH_V(kabs8) + BUILTIN_WITH_V(kabs16) + BUILTIN_WITH_V(swap8) + BUILTIN_WITH_V(swap16) + BUILTIN_WITH_V(kadd8) + BUILTIN_WITH_V(kadd16) + BUILTIN_WITH_V(kcras16) + BUILTIN_WITH_V(kcrsa16) + BUILTIN_WITH_V(khm8) + BUILTIN_WITH_V(khmx8) + BUILTIN_WITH_V(khm16) + BUILTIN_WITH_V(khmx16) + BUILTIN_WITH_V(kstas16) + BUILTIN_WITH_V(kstsa16) + BUILTIN_WITH_V(ksub8) + BUILTIN_WITH_V(ksub16) + BUILTIN_WITH_V(kwmmul) + BUILTIN_WITH_V(kwmmul_u) + BUILTIN_WITH_V(pkbb16) + BUILTIN_WITH_V(pkbt16) + BUILTIN_WITH_V(pktt16) + BUILTIN_WITH_V(pktb16) + BUILTIN_WITH_V(radd8) + BUILTIN_WITH_V(radd16) + BUILTIN_WITH_V(rcras16) + BUILTIN_WITH_V(rcrsa16) + BUILTIN_WITH_V(rstas16) + BUILTIN_WITH_V(rstsa16) + BUILTIN_WITH_V(rsub8) + BUILTIN_WITH_V(rsub16) + BUILTIN_WITH_V(scmple8) + BUILTIN_WITH_V(scmple16) + BUILTIN_WITH_V(scmplt8) + BUILTIN_WITH_V(scmplt16) + BUILTIN_WITH_V(smax8) + BUILTIN_WITH_V(smax16) + BUILTIN_WITH_V(smin8) + BUILTIN_WITH_V(smin16) + BUILTIN_WITH_V(smmul) + BUILTIN_WITH_V(smmul_u) + BUILTIN_WITH_V(ucmple8) + BUILTIN_WITH_V(ucmple16) + BUILTIN_WITH_V(ucmplt8) + BUILTIN_WITH_V(ucmplt16) + BUILTIN_WITH_V(ukadd8) + BUILTIN_WITH_V(ukadd16) + BUILTIN_WITH_V(ukcras16) + BUILTIN_WITH_V(ukcrsa16) + BUILTIN_WITH_V(ukstas16) + BUILTIN_WITH_V(ukstsa16) + BUILTIN_WITH_V(uksub8) + BUILTIN_WITH_V(uksub16) + BUILTIN_WITH_V(umax8) + BUILTIN_WITH_V(umax16) + BUILTIN_WITH_V(umin8) + BUILTIN_WITH_V(umin16) + BUILTIN_WITH_V(uradd8) + BUILTIN_WITH_V(uradd16) + BUILTIN_WITH_V(urcras16) + BUILTIN_WITH_V(urcrsa16) + BUILTIN_WITH_V(urstas16) + BUILTIN_WITH_V(urstsa16) + BUILTIN_WITH_V(ursub8) + BUILTIN_WITH_V(ursub16) + BUILTIN_WITH_V(kmmac) + BUILTIN_WITH_V(kmmac_u) + BUILTIN_WITH_V(kmmsb) + BUILTIN_WITH_V(kmmsb_u) + + BUILTIN_WITH_US_V(add8) + BUILTIN_WITH_US_V(add16) + BUILTIN_WITH_US_V(cmpeq8) + BUILTIN_WITH_US_V(cmpeq16) + BUILTIN_WITH_US_V(cras16) + BUILTIN_WITH_US_V(crsa16) + BUILTIN_WITH_US_V(stas16) + BUILTIN_WITH_US_V(stsa16) + BUILTIN_WITH_US_V(sub8) + BUILTIN_WITH_US_V(sub16) + +#define BUILTIN_PKD(NAME) \ + case RISCV::BI__rv__##NAME: \ + ID = Intrinsic::riscv_##NAME; \ + IntrinsicTypes = { ConvertType(E->getType()) }; \ + break; + +#define BUILTIN_PKD_WITH_V(NAME) \ + BUILTIN(NAME) \ + BUILTIN_PKD(v_##NAME) + + BUILTIN_PKD_WITH_V(sunpkd810) + BUILTIN_PKD_WITH_V(sunpkd820) + BUILTIN_PKD_WITH_V(sunpkd830) + BUILTIN_PKD_WITH_V(sunpkd831) + BUILTIN_PKD_WITH_V(sunpkd832) + BUILTIN_PKD_WITH_V(zunpkd810) + BUILTIN_PKD_WITH_V(zunpkd820) + BUILTIN_PKD_WITH_V(zunpkd830) + BUILTIN_PKD_WITH_V(zunpkd831) + BUILTIN_PKD_WITH_V(zunpkd832) + +#define BUILTIN_AB(NAME) \ + case RISCV::BI__rv__##NAME: \ + ID = Intrinsic::riscv_##NAME; \ + IntrinsicTypes = { ConvertType(E->getType()), Ops[0]->getType() }; \ + break; + +#define BUILTIN_AB_WITH_V(NAME) \ + BUILTIN(NAME) \ + BUILTIN_AB(v_##NAME) + + BUILTIN_AB_WITH_V(kdmbb) + BUILTIN_AB_WITH_V(kdmbt) + BUILTIN_AB_WITH_V(kdmtt) + BUILTIN_AB_WITH_V(khmbb) + BUILTIN_AB_WITH_V(khmbt) + BUILTIN_AB_WITH_V(khmtt) + BUILTIN_AB_WITH_V(kmda) + BUILTIN_AB_WITH_V(kmxda) + BUILTIN_AB_WITH_V(pbsad) + BUILTIN_AB_WITH_V(smbb16) + BUILTIN_AB_WITH_V(smbt16) + BUILTIN_AB_WITH_V(smtt16) + BUILTIN_AB_WITH_V(smds) + BUILTIN_AB_WITH_V(smdrs) + BUILTIN_AB_WITH_V(smxds) + +#define BUILTIN_AAB(NAME) \ + case RISCV::BI__rv__##NAME: \ + ID = Intrinsic::riscv_##NAME; \ + IntrinsicTypes = { ConvertType(E->getType()), Ops[1]->getType() }; \ + break; + +#define 
BUILTIN_AAB_WITH_V(NAME) \ + BUILTIN(NAME) \ + BUILTIN_AAB(v_##NAME) + + BUILTIN_AAB_WITH_V(kmmwb2) + BUILTIN_AAB_WITH_V(kmmwb2_u) + BUILTIN_AAB_WITH_V(kmmwt2) + BUILTIN_AAB_WITH_V(kmmwt2_u) + BUILTIN_AAB_WITH_V(smmwb) + BUILTIN_AAB_WITH_V(smmwb_u) + BUILTIN_AAB_WITH_V(smmwt) + BUILTIN_AAB_WITH_V(smmwt_u) + + BUILTIN_AAB_WITH_V(ksll8) + BUILTIN_AAB_WITH_V(ksll16) + BUILTIN_AAB_WITH_V(kslra8) + BUILTIN_AAB_WITH_V(kslra8_u) + BUILTIN_AAB_WITH_V(kslra16) + BUILTIN_AAB_WITH_V(kslra16_u) + BUILTIN_AAB_WITH_V(sclip8) + BUILTIN_AAB_WITH_V(sclip16) + BUILTIN_AAB_WITH_V(sclip32) + BUILTIN_AAB_WITH_V(sll8) + BUILTIN_AAB_WITH_V(sll16) + BUILTIN_AAB_WITH_V(sra8) + BUILTIN_AAB_WITH_V(sra8_u) + BUILTIN_AAB_WITH_V(sra16) + BUILTIN_AAB_WITH_V(sra16_u) + BUILTIN_AAB_WITH_V(srl8) + BUILTIN_AAB_WITH_V(srl8_u) + BUILTIN_AAB_WITH_V(srl16) + BUILTIN_AAB_WITH_V(srl16_u) + BUILTIN_AAB_WITH_V(uclip8) + BUILTIN_AAB_WITH_V(uclip16) + BUILTIN_AAB_WITH_V(uclip32) + + BUILTIN_AAB_WITH_V(kdmabb) + BUILTIN_AAB_WITH_V(kdmabt) + BUILTIN_AAB_WITH_V(kdmatt) + BUILTIN_AAB_WITH_V(kmabb) + BUILTIN_AAB_WITH_V(kmabt) + BUILTIN_AAB_WITH_V(kmatt) + BUILTIN_AAB_WITH_V(kmada) + BUILTIN_AAB_WITH_V(kmaxda) + BUILTIN_AAB_WITH_V(kmads) + BUILTIN_AAB_WITH_V(kmadrs) + BUILTIN_AAB_WITH_V(kmaxds) + BUILTIN_AAB_WITH_V(kmsda) + BUILTIN_AAB_WITH_V(kmsxda) + BUILTIN_AAB_WITH_V(pbsada) + BUILTIN_AAB_WITH_V(smaqa) + BUILTIN_AAB_WITH_V(smaqa_su) + BUILTIN_AAB_WITH_V(umaqa) + +#define BUILTIN_AAAB(NAME) \ + case RISCV::BI__rv__##NAME: \ + ID = Intrinsic::riscv_##NAME; \ + IntrinsicTypes = { ConvertType(E->getType()), Ops[2]->getType() }; \ + break; + +#define BUILTIN_AAAB_WITH_V(NAME) \ + BUILTIN(NAME) \ + BUILTIN_AAAB(v_##NAME) + + BUILTIN_AAAB_WITH_V(kmmawb) + BUILTIN_AAAB_WITH_V(kmmawb_u) + BUILTIN_AAAB_WITH_V(kmmawb2) + BUILTIN_AAAB_WITH_V(kmmawb2_u) + BUILTIN_AAAB_WITH_V(kmmawt) + BUILTIN_AAAB_WITH_V(kmmawt_u) + BUILTIN_AAAB_WITH_V(kmmawt2) + BUILTIN_AAAB_WITH_V(kmmawt2_u) } assert(ID != Intrinsic::not_intrinsic); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3400,6 +3400,10 @@ !TI.hasFeature("experimental-v")) return Diag(TheCall->getBeginLoc(), diag::err_riscvv_builtin_requires_v) << TheCall->getSourceRange(); + if (Features.find("experimental-p") != StringRef::npos && + !TI.hasFeature("experimental-p")) + return Diag(TheCall->getBeginLoc(), diag::err_riscvv_builtin_requires_p) + << TheCall->getSourceRange(); return false; } diff --git a/clang/test/CodeGen/builtins-riscv-rv32p.c b/clang/test/CodeGen/builtins-riscv-rv32p.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/builtins-riscv-rv32p.c @@ -0,0 +1,906 @@ +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv32-unknown-elf -target-feature +experimental-p \ +// RUN: -emit-llvm -o - %s | FileCheck %s -check-prefix=RV32 + +typedef signed char int8x4_t __attribute((vector_size(4))); +typedef signed char int8x8_t __attribute((vector_size(8))); +typedef short int16x2_t __attribute((vector_size(4))); +typedef short int16x4_t __attribute__((vector_size(8))); +typedef short int16x8_t __attribute__((vector_size(16))); +typedef int int32x2_t __attribute__((vector_size(8))); +typedef int int32x4_t __attribute__((vector_size(16))); +typedef unsigned char uint8x4_t __attribute__((vector_size(4))); +typedef unsigned char uint8x8_t __attribute__((vector_size(8))); +typedef unsigned short uint16x2_t __attribute__((vector_size(4))); +typedef unsigned short uint16x4_t 
__attribute__((vector_size(8))); +typedef unsigned short uint16x8_t __attribute__((vector_size(16))); +typedef unsigned int uint32x2_t __attribute__((vector_size(8))); +typedef unsigned int uint32x4_t __attribute__((vector_size(16))); + +void test(void) { + int i_t = 0, i_a = 0, i_b = 1; + int i_r; + + unsigned int ui_t = 0, ui_a = 1, ui_b = 2; + unsigned int ui_r; + + long l_t = 0, l_a = 1, l_b = 2; + long l_r; + + unsigned long ul_t = 0, ul_a = 1, ul_b = 2, ul_c = 3; + unsigned long ul_r; + + long long ll_t = 0, ll_a = 1, ll_b = 2; + long long ll_r; + + unsigned long long ull_t = 0, ull_a = 1, ull_b = 2; + unsigned long long ull_r; + + int8x4_t i8x4_a = {0, 1, 2, 3}; + int8x4_t i8x4_b = {0, 1, 2, 3}; + int8x4_t i8x4_r; + + uint8x4_t u8x4_a = {0, 1, 2, 3}; + uint8x4_t u8x4_b = {0, 1, 2, 3}; + uint8x4_t u8x4_r; + + int16x2_t i16x2_a = {0, 1}; + int16x2_t i16x2_b = {0, 1}; + int16x2_t i16x2_r; + + uint16x2_t u16x2_a = {0, 1}; + uint16x2_t u16x2_b = {0, 1}; + uint16x2_t u16x2_r; + + int16x4_t i16x4_r; + + uint16x4_t u16x4_r; + + int32x2_t i32x2_r; + + uint32x2_t u32x2_r; + + // RV32: call i32 @llvm.riscv.add8.i32 + ul_r = __rv__add8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.add8 + u8x4_r = __rv__v_uadd8(u8x4_a, u8x4_b); + // RV32: call <4 x i8> @llvm.riscv.v.add8 + i8x4_r = __rv__v_sadd8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.add16.i32 + ul_r = __rv__add16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.add16 + u16x2_r = __rv__v_uadd16(u16x2_a, u16x2_b); + // RV32: call <2 x i16> @llvm.riscv.v.add16 + i16x2_r = __rv__v_sadd16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.ave.i32 + l_r = __rv__ave(l_a, l_b); + + // RV32: call i32 @llvm.riscv.bitrev.i32 + ul_r = __rv__bitrev(ul_a, ul_b); + + // RV32: call i32 @llvm.riscv.bpick.i32 + ul_r = __rv__bpick(ul_a, ul_b, ul_c); + + // RV32: call i32 @llvm.riscv.clrs8.i32 + ul_r = __rv__clrs8(ul_a); + // RV32: call <4 x i8> @llvm.riscv.v.clrs8 + u8x4_r = __rv__v_clrs8(i8x4_a); + + // RV32: call i32 @llvm.riscv.clrs16.i32 + ul_r = __rv__clrs16(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.clrs16 + u16x2_r = __rv__v_clrs16(i16x2_a); + + // RV32: call i32 @llvm.riscv.clrs32.i32 + ul_r = __rv__clrs32(ul_a); + + // RV32: call i32 @llvm.riscv.clo8.i32 + ul_r = __rv__clo8(ul_a); + // RV32: call <4 x i8> @llvm.riscv.v.clo8 + u8x4_r = __rv__v_clo8(u8x4_a); + + // RV32: call i32 @llvm.riscv.clo16.i32 + ul_r = __rv__clo16(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.clo16 + u16x2_r = __rv__v_clo16(u16x2_a); + + // RV32: call i32 @llvm.riscv.clo32.i32 + ul_r = __rv__clo32(ul_a); + + // RV32: call i32 @llvm.riscv.clz8.i32 + ul_r = __rv__clz8(ul_a); + // RV32: call <4 x i8> @llvm.riscv.v.clz8 + u8x4_r = __rv__v_clz8(u8x4_a); + + // RV32: call i32 @llvm.riscv.clz16.i32 + ul_r = __rv__clz16(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.clz16 + u16x2_r = __rv__v_clz16(u16x2_a); + + // RV32: call i32 @llvm.riscv.clz32.i32 + ul_r = __rv__clz32(ul_a); + + // RV32: call i32 @llvm.riscv.cmpeq8.i32 + ul_r = __rv__cmpeq8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.cmpeq8 + u8x4_r = __rv__v_scmpeq8(i8x4_a, i8x4_b); + // RV32: call <4 x i8> @llvm.riscv.v.cmpeq8 + u8x4_r = __rv__v_ucmpeq8(u8x4_a, u8x4_b); + + // RV32: call i32 @llvm.riscv.cmpeq16.i32 + ul_r = __rv__cmpeq16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.cmpeq16 + u16x2_r = __rv__v_scmpeq16(i16x2_a, i16x2_b); + // RV32: call <2 x i16> @llvm.riscv.v.cmpeq16 + u16x2_r = __rv__v_ucmpeq16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.cras16.i32 + ul_r = 
__rv__cras16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.cras16 + u16x2_r = __rv__v_ucras16(u16x2_a, u16x2_b); + // RV32: call <2 x i16> @llvm.riscv.v.cras16 + i16x2_r = __rv__v_scras16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.crsa16.i32 + ul_r = __rv__crsa16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.crsa16 + u16x2_r = __rv__v_ucrsa16(u16x2_a, u16x2_b); + // RV32: call <2 x i16> @llvm.riscv.v.crsa16 + i16x2_r = __rv__v_scrsa16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.insb.i32 + ul_r = __rv__insb(ul_a, ul_b, 3); + + // RV32: call i32 @llvm.riscv.kabs8.i32 + ul_r = __rv__kabs8(ul_a); + // RV32: call <4 x i8> @llvm.riscv.v.kabs8 + i8x4_r = __rv__v_kabs8(i8x4_a); + + // RV32: call i32 @llvm.riscv.kabs16.i32 + ul_r = __rv__kabs16(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.kabs16 + i16x2_r = __rv__v_kabs16(i16x2_a); + + // RV32: call i32 @llvm.riscv.kabsw.i32 + l_r = __rv__kabsw(l_a); + + // RV32: call i32 @llvm.riscv.kadd8.i32 + ul_r = __rv__kadd8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.kadd8 + i8x4_r = __rv__v_kadd8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.kadd16.i32 + ul_r = __rv__kadd16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.kadd16 + i16x2_r = __rv__v_kadd16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kaddh.i32 + l_r = __rv__kaddh(i_a, i_b); + + // RV32: call i32 @llvm.riscv.kaddw.i32 + l_r = __rv__kaddw(i_a, i_b); + + // RV32: call i32 @llvm.riscv.kcras16.i32 + ul_r = __rv__kcras16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.kcras16 + i16x2_r = __rv__v_kcras16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kcrsa16.i32 + ul_r = __rv__kcrsa16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.kcrsa16 + i16x2_r = __rv__v_kcrsa16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kdmbb.i32 + l_r = __rv__kdmbb(ui_a, ui_b); + // RV32: call i32 @llvm.riscv.v.kdmbb + l_r = __rv__v_kdmbb(i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kdmbt.i32 + l_r = __rv__kdmbt(ui_a, ui_b); + // RV32: call i32 @llvm.riscv.v.kdmbt + l_r = __rv__v_kdmbt(i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kdmtt.i32 + l_r = __rv__kdmtt(ui_a, ui_b); + // RV32: call i32 @llvm.riscv.v.kdmtt + l_r = __rv__v_kdmtt(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kdmabb.i32 + l_r = __rv__kdmabb(l_t, ui_a, ui_b); + // RV32: call i32 @llvm.riscv.v.kdmabb + l_r = __rv__v_kdmabb(l_t, i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kdmabt.i32 + l_r = __rv__kdmabt(l_t, ui_a, ui_b); + // RV32: call i32 @llvm.riscv.v.kdmabt + l_r = __rv__v_kdmabt(l_t, i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kdmatt.i32 + l_r = __rv__kdmatt(l_t, ui_a, ui_b); + // RV32: call i32 @llvm.riscv.v.kdmatt + l_r = __rv__v_kdmatt(l_t, i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.khm8.i32 + ul_r = __rv__khm8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.khm8 + i8x4_r = __rv__v_khm8(i8x4_a, i8x4_b); + // RV32: call i32 @llvm.riscv.khmx8.i32 + ul_r = __rv__khmx8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.khmx8 + i8x4_r = __rv__v_khmx8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.khm16.i32 + ul_r = __rv__khm16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.khm16 + i16x2_r = __rv__v_khm16(i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.khmx16.i32 + ul_r = __rv__khmx16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.khmx16 + i16x2_r = __rv__v_khmx16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.khmbb.i32 + l_r = __rv__khmbb(ui_a, ui_b); + // RV32: call i32 @llvm.riscv.v.khmbb + l_r = 
__rv__v_khmbb(i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.khmbt.i32 + l_r = __rv__khmbt(ui_a, ui_b); + // RV32: call i32 @llvm.riscv.v.khmbt + l_r = __rv__v_khmbt(i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.khmtt.i32 + l_r = __rv__khmtt(ui_a, ui_b); + // RV32: call i32 @llvm.riscv.v.khmtt + l_r = __rv__v_khmtt(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kmabb.i32 + l_r = __rv__kmabb(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmabb + l_r = __rv__v_kmabb(l_t, i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmabt.i32 + l_r = __rv__kmabt(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmabt + l_r = __rv__v_kmabt(l_t, i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmatt.i32 + l_r = __rv__kmatt(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmatt + l_r = __rv__v_kmatt(l_t, i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kmada.i32 + l_r = __rv__kmada(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmada + l_r = __rv__v_kmada(l_t, i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmaxda.i32 + l_r = __rv__kmaxda(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmaxda + l_r = __rv__v_kmaxda(l_t, i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kmads.i32 + l_r = __rv__kmads(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmads + l_r = __rv__v_kmads(l_t, i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmadrs.i32 + l_r = __rv__kmadrs(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmadrs + l_r = __rv__v_kmadrs(l_t, i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmaxds.i32 + l_r = __rv__kmaxds(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmaxds + l_r = __rv__v_kmaxds(l_t, i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kmda.i32 + l_r = __rv__kmda(ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmda + l_r = __rv__v_kmda(i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmxda.i32 + l_r = __rv__kmxda(ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmxda + l_r = __rv__v_kmxda(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kmmac.i32 + l_r = __rv__kmmac(l_t, l_a, l_b); + // RV32: call i32 @llvm.riscv.kmmac.u.i32 + l_r = __rv__kmmac_u(l_t, l_a, l_b); + + // RV32: call i32 @llvm.riscv.kmmawb.i32 + l_r = __rv__kmmawb(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmawb + i_r = __rv__v_kmmawb(i_t, i_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmmawb.u.i32 + l_r = __rv__kmmawb_u(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmawb.u + i_r = __rv__v_kmmawb_u(i_t, i_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kmmawb2.i32 + l_r = __rv__kmmawb2(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmawb2 + i_r = __rv__v_kmmawb2(i_t, i_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmmawb2.u.i32 + l_r = __rv__kmmawb2_u(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmawb2.u + i_r = __rv__v_kmmawb2_u(i_t, i_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kmmawt.i32 + l_r = __rv__kmmawt(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmawt + i_r = __rv__v_kmmawt(i_t, i_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmmawt.u.i32 + l_r = __rv__kmmawt_u(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmawt.u + i_r = __rv__v_kmmawt_u(i_t, i_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kmmawt2.i32 + l_r = __rv__kmmawt2(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmawt2 + i_r = __rv__v_kmmawt2(i_t, i_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmmawt2.u.i32 + l_r = __rv__kmmawt2_u(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmawt2.u + i_r = __rv__v_kmmawt2_u(i_t, i_a, i16x2_b); + + // 
RV32: call i32 @llvm.riscv.kmmsb.i32 + l_r = __rv__kmmsb(l_t, l_a, l_b); + // RV32: call i32 @llvm.riscv.kmmsb.u.i32 + l_r = __rv__kmmsb_u(l_t, l_a, l_b); + + // RV32: call i32 @llvm.riscv.kmmwb2.i32 + l_r = __rv__kmmwb2(l_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmwb2 + i_r = __rv__v_kmmwb2(i_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmmwb2.u.i32 + l_r = __rv__kmmwb2_u(l_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmwb2.u + i_r = __rv__v_kmmwb2_u(i_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kmmwt2.i32 + l_r = __rv__kmmwt2(l_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmwt2 + i_r = __rv__v_kmmwt2(i_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmmwt2.u.i32 + l_r = __rv__kmmwt2_u(l_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmmwt2.u + i_r = __rv__v_kmmwt2_u(i_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kmsda.i32 + l_r = __rv__kmsda(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmsda + i_r = __rv__v_kmsda(i_t, i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.kmsxda.i32 + l_r = __rv__kmsxda(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.kmsxda + i_r = __rv__v_kmsxda(i_t, i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.ksllw.i32 + l_r = __rv__ksllw(l_a, i_b); + + // RV32: call i32 @llvm.riscv.ksll8.i32 + ul_r = __rv__ksll8(ul_a, ui_b); + // RV32: call <4 x i8> @llvm.riscv.v.ksll8 + i8x4_r = __rv__v_ksll8(i8x4_a, ui_b); + + // RV32: call i32 @llvm.riscv.ksll16.i32 + ul_r = __rv__ksll16(ul_a, ui_b); + // RV32: call <2 x i16> @llvm.riscv.v.ksll16 + i16x2_r = __rv__v_ksll16(i16x2_a, ui_b); + + // RV32: call i32 @llvm.riscv.kslra8.i32 + ul_r = __rv__kslra8(ul_a, i_b); + // RV32: call <4 x i8> @llvm.riscv.v.kslra8 + i8x4_r = __rv__v_kslra8(i8x4_a, i_b); + // RV32: call i32 @llvm.riscv.kslra8.u.i32 + ul_r = __rv__kslra8_u(ul_a, i_b); + // RV32: call <4 x i8> @llvm.riscv.v.kslra8.u + i8x4_r = __rv__v_kslra8_u(i8x4_a, i_b); + + // RV32: call i32 @llvm.riscv.kslra16.i32 + ul_r = __rv__kslra16(ul_a, i_b); + // RV32: call <2 x i16> @llvm.riscv.v.kslra16 + i16x2_r = __rv__v_kslra16(i16x2_a, i_b); + // RV32: call i32 @llvm.riscv.kslra16.u.i32 + ul_r = __rv__kslra16_u(ul_a, i_b); + // RV32: call <2 x i16> @llvm.riscv.v.kslra16.u + i16x2_r = __rv__v_kslra16_u(i16x2_a, i_b); + + // RV32: call i32 @llvm.riscv.kstas16.i32 + ul_r = __rv__kstas16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.kstas16 + i16x2_r = __rv__v_kstas16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.kstsa16.i32 + ul_r = __rv__kstsa16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.kstsa16 + i16x2_r = __rv__v_kstsa16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.ksub8.i32 + ul_r = __rv__ksub8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.ksub8 + i8x4_r = __rv__v_ksub8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.ksub16.i32 + ul_r = __rv__ksub16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.ksub16 + i16x2_r = __rv__v_ksub16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.ksubh.i32 + l_r = __rv__ksubh(i_a, i_b); + + // RV32: call i32 @llvm.riscv.ksubw.i32 + l_r = __rv__ksubw(i_a, i_b); + + // RV32: call i32 @llvm.riscv.kwmmul.i32 + l_r = __rv__kwmmul(l_a, l_b); + // RV32: call i32 @llvm.riscv.kwmmul.u.i32 + l_r = __rv__kwmmul_u(l_a, l_b); + + // RV32: call i32 @llvm.riscv.maxw.i32 + l_r = __rv__maxw(i_a, i_b); + + // RV32: call i32 @llvm.riscv.minw.i32 + l_r = __rv__minw(i_a, i_b); + + // RV32: call i32 @llvm.riscv.pbsad.i32 + ul_r = __rv__pbsad(ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.pbsad + ui_r = __rv__v_pbsad(u8x4_a, u8x4_b); + + // RV32: call i32 
@llvm.riscv.pbsada.i32 + ul_r = __rv__pbsada(ul_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.pbsada + ui_r = __rv__v_pbsada(ul_t, u8x4_a, u8x4_b); + + // RV32: call i32 @llvm.riscv.pkbb16.i32 + ul_r = __rv__pkbb16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.pkbb16 + u16x2_r = __rv__v_pkbb16(u16x2_a, u16x2_b); + // RV32: call i32 @llvm.riscv.pkbt16.i32 + ul_r = __rv__pkbt16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.pkbt16 + u16x2_r = __rv__v_pkbt16(u16x2_a, u16x2_b); + // RV32: call i32 @llvm.riscv.pktt16.i32 + ul_r = __rv__pktt16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.pktt16 + u16x2_r = __rv__v_pktt16(u16x2_a, u16x2_b); + // RV32: call i32 @llvm.riscv.pktb16.i32 + ul_r = __rv__pktb16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.pktb16 + u16x2_r = __rv__v_pktb16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.radd8.i32 + ul_r = __rv__radd8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.radd8 + i8x4_r = __rv__v_radd8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.radd16.i32 + ul_r = __rv__radd16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.radd16 + i16x2_r = __rv__v_radd16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.raddw.i32 + l_r = __rv__raddw(i_a, i_b); + + // RV32: call i32 @llvm.riscv.rcras16.i32 + ul_r = __rv__rcras16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.rcras16 + i16x2_r = __rv__v_rcras16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.rcrsa16.i32 + ul_r = __rv__rcrsa16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.rcrsa16 + i16x2_r = __rv__v_rcrsa16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.rstas16.i32 + ul_r = __rv__rstas16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.rstas16 + i16x2_r = __rv__v_rstas16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.rstsa16.i32 + ul_r = __rv__rstsa16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.rstsa16 + i16x2_r = __rv__v_rstsa16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.rsub8.i32 + ul_r = __rv__rsub8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.rsub8 + i8x4_r = __rv__v_rsub8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.rsub16.i32 + ul_r = __rv__rsub16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.rsub16 + i16x2_r = __rv__v_rsub16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.rsubw.i32 + l_r = __rv__rsubw(i_a, i_b); + + // RV32: call i32 @llvm.riscv.sclip8.i32 + ul_r = __rv__sclip8(ul_a, 5); + // RV32: call <4 x i8> @llvm.riscv.v.sclip8 + i8x4_r = __rv__v_sclip8(i8x4_a, 5); + + // RV32: call i32 @llvm.riscv.sclip16.i32 + ul_r = __rv__sclip16(ul_a, 6); + // RV32: call <2 x i16> @llvm.riscv.v.sclip16 + i16x2_r = __rv__v_sclip16(i16x2_a, 6); + + // RV32: call i32 @llvm.riscv.sclip32.i32 + l_r = __rv__sclip32(l_a, 7); + + // RV32: call i32 @llvm.riscv.scmple8.i32 + ul_r = __rv__scmple8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.scmple8 + u8x4_r = __rv__v_scmple8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.scmple16.i32 + ul_r = __rv__scmple16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.scmple16 + u16x2_r = __rv__v_scmple16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.scmplt8.i32 + ul_r = __rv__scmplt8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.scmplt8 + u8x4_r = __rv__v_scmplt8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.scmplt16.i32 + ul_r = __rv__scmplt16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.scmplt16 + u16x2_r = __rv__v_scmplt16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.sll8.i32 + ul_r = __rv__sll8(ul_a, 
ui_b); + // RV32: call <4 x i8> @llvm.riscv.v.sll8 + u8x4_r = __rv__v_sll8(u8x4_a, ui_b); + + // RV32: call i32 @llvm.riscv.sll16.i32 + ul_r = __rv__sll16(ul_a, ui_b); + // RV32: call <2 x i16> @llvm.riscv.v.sll16 + u16x2_r = __rv__v_sll16(u16x2_a, ui_b); + + // RV32: call i32 @llvm.riscv.smaqa.i32 + l_r = __rv__smaqa(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smaqa + i_r = __rv__v_smaqa(i_t, i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.smaqa.su.i32 + l_r = __rv__smaqa_su(l_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smaqa.su + i_r = __rv__v_smaqa_su(i_t, i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.smax8.i32 + ul_r = __rv__smax8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.smax8 + i8x4_r = __rv__v_smax8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.smax16.i32 + ul_r = __rv__smax16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.smax16 + i16x2_r = __rv__v_smax16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.smbb16.i32 + l_r = __rv__smbb16(ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smbb16 + i_r = __rv__v_smbb16(i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.smbt16.i32 + l_r = __rv__smbt16(ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smbt16 + i_r = __rv__v_smbt16(i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.smtt16.i32 + l_r = __rv__smtt16(ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smtt16 + i_r = __rv__v_smtt16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.smds.i32 + l_r = __rv__smds(ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smds + i_r = __rv__v_smds(i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.smdrs.i32 + l_r = __rv__smdrs(ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smdrs + i_r = __rv__v_smdrs(i16x2_a, i16x2_b); + // RV32: call i32 @llvm.riscv.smxds.i32 + l_r = __rv__smxds(ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smxds + i_r = __rv__v_smxds(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.smin8.i32 + ul_r = __rv__smin8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.smin8 + i8x4_r = __rv__v_smin8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.smin16.i32 + ul_r = __rv__smin16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.smin16 + i16x2_r = __rv__v_smin16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.smmul.i32 + l_r = __rv__smmul(l_a, l_b); + // RV32: call i32 @llvm.riscv.smmul.u.i32 + l_r = __rv__smmul_u(l_a, l_b); + + // RV32: call i32 @llvm.riscv.smmwb.i32 + l_r = __rv__smmwb(l_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smmwb + i_r = __rv__v_smmwb(i_a, i16x2_b); + // RV32: call i32 @llvm.riscv.smmwb.u.i32 + l_r = __rv__smmwb_u(l_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smmwb.u + i_r = __rv__v_smmwb_u(i_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.smmwt.i32 + l_r = __rv__smmwt(l_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smmwt + i_r = __rv__v_smmwt(i_a, i16x2_b); + // RV32: call i32 @llvm.riscv.smmwt.u.i32 + l_r = __rv__smmwt_u(l_a, ul_b); + // RV32: call i32 @llvm.riscv.v.smmwt.u + i_r = __rv__v_smmwt_u(i_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.sra.u.i32 + l_r = __rv__sra_u(l_a, ui_b); + + // RV32: call i32 @llvm.riscv.sra8.i32 + ul_r = __rv__sra8(ul_a, ui_b); + // RV32: call <4 x i8> @llvm.riscv.v.sra8 + i8x4_r = __rv__v_sra8(i8x4_a, ui_b); + // RV32: call i32 @llvm.riscv.sra8.u.i32 + ul_r = __rv__sra8_u(ul_a, ui_b); + // RV32: call <4 x i8> @llvm.riscv.v.sra8.u + i8x4_r = __rv__v_sra8_u(i8x4_a, ui_b); + + // RV32: call i32 @llvm.riscv.sra16.i32 + ul_r = __rv__sra16(ul_a, ui_b); + // RV32: call <2 x i16> @llvm.riscv.v.sra16 + i16x2_r = 
__rv__v_sra16(i16x2_a, ui_b); + // RV32: call i32 @llvm.riscv.sra16.u.i32 + ul_r = __rv__sra16_u(ul_a, ui_b); + // RV32: call <2 x i16> @llvm.riscv.v.sra16.u + i16x2_r = __rv__v_sra16_u(i16x2_a, ui_b); + + // RV32: call i32 @llvm.riscv.srl8.i32 + ul_r = __rv__srl8(ul_a, ui_b); + // RV32: call <4 x i8> @llvm.riscv.v.srl8 + u8x4_r = __rv__v_srl8(u8x4_a, ui_b); + // RV32: call i32 @llvm.riscv.srl8.u.i32 + ul_r = __rv__srl8_u(ul_a, ui_b); + // RV32: call <4 x i8> @llvm.riscv.v.srl8.u + u8x4_r = __rv__v_srl8_u(u8x4_a, ui_b); + + // RV32: call i32 @llvm.riscv.srl16.i32 + ul_r = __rv__srl16(ul_a, ui_b); + // RV32: call <2 x i16> @llvm.riscv.v.srl16 + u16x2_r = __rv__v_srl16(u16x2_a, ui_b); + // RV32: call i32 @llvm.riscv.srl16.u.i32 + ul_r = __rv__srl16_u(ul_a, ui_b); + // RV32: call <2 x i16> @llvm.riscv.v.srl16.u + u16x2_r = __rv__v_srl16_u(u16x2_a, ui_b); + + // RV32: call i32 @llvm.riscv.stas16.i32 + ul_r = __rv__stas16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.stas16 + u16x2_r = __rv__v_ustas16(u16x2_a, u16x2_b); + // RV32: call <2 x i16> @llvm.riscv.v.stas16 + i16x2_r = __rv__v_sstas16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.stsa16.i32 + ul_r = __rv__stsa16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.stsa16 + u16x2_r = __rv__v_ustsa16(u16x2_a, u16x2_b); + // RV32: call <2 x i16> @llvm.riscv.v.stsa16 + i16x2_r = __rv__v_sstsa16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.sub8.i32 + ul_r = __rv__sub8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.sub8 + u8x4_r = __rv__v_usub8(u8x4_a, u8x4_b); + // RV32: call <4 x i8> @llvm.riscv.v.sub8 + i8x4_r = __rv__v_ssub8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.sub16.i32 + ul_r = __rv__sub16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.sub16 + u16x2_r = __rv__v_usub16(u16x2_a, u16x2_b); + // RV32: call <2 x i16> @llvm.riscv.v.sub16 + i16x2_r = __rv__v_ssub16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.sunpkd810.i32 + ul_r = __rv__sunpkd810(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.sunpkd810 + i16x2_r = __rv__v_sunpkd810(i8x4_a); + // RV32: call i32 @llvm.riscv.sunpkd820.i32 + ul_r = __rv__sunpkd820(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.sunpkd820 + i16x2_r = __rv__v_sunpkd820(i8x4_a); + // RV32: call i32 @llvm.riscv.sunpkd830.i32 + ul_r = __rv__sunpkd830(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.sunpkd830 + i16x2_r = __rv__v_sunpkd830(i8x4_a); + // RV32: call i32 @llvm.riscv.sunpkd831.i32 + ul_r = __rv__sunpkd831(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.sunpkd831 + i16x2_r = __rv__v_sunpkd831(i8x4_a); + // RV32: call i32 @llvm.riscv.sunpkd832.i32 + ul_r = __rv__sunpkd832(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.sunpkd832 + i16x2_r = __rv__v_sunpkd832(i8x4_a); + + // RV32: call i32 @llvm.riscv.swap8.i32 + ul_r = __rv__swap8(ul_a); + // RV32: call <4 x i8> @llvm.riscv.v.swap8 + u8x4_r = __rv__v_swap8(u8x4_a); + + // RV32: call i32 @llvm.riscv.swap16.i32 + ul_r = __rv__swap16(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.swap16 + u16x2_r = __rv__v_swap16(u16x2_a); + + // RV32: call i32 @llvm.riscv.uclip8.i32 + ul_r = __rv__uclip8(ul_a, 5); + // RV32: call <4 x i8> @llvm.riscv.v.uclip8 + u8x4_r = __rv__v_uclip8(u8x4_a, 5); + + // RV32: call i32 @llvm.riscv.uclip16.i32 + ul_r = __rv__uclip16(ul_a, 6); + // RV32: call <2 x i16> @llvm.riscv.v.uclip16 + u16x2_r = __rv__v_uclip16(u16x2_a, 6); + + // RV32: call i32 @llvm.riscv.uclip32.i32 + l_r = __rv__uclip32(l_a, 7); + + // RV32: call i32 @llvm.riscv.ucmple8.i32 + ul_r = __rv__ucmple8(ul_a, ul_b); + // RV32: 
call <4 x i8> @llvm.riscv.v.ucmple8 + u8x4_r = __rv__v_ucmple8(u8x4_a, u8x4_b); + + // RV32: call i32 @llvm.riscv.ucmple16.i32 + ul_r = __rv__ucmple16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.ucmple16 + u16x2_r = __rv__v_ucmple16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.ucmplt8.i32 + ul_r = __rv__ucmplt8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.ucmplt8 + u8x4_r = __rv__v_ucmplt8(u8x4_a, u8x4_b); + + // RV32: call i32 @llvm.riscv.ucmplt16.i32 + ul_r = __rv__ucmplt16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.ucmplt16 + u16x2_r = __rv__v_ucmplt16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.ukadd8.i32 + ul_r = __rv__ukadd8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.ukadd8 + i8x4_r = __rv__v_ukadd8(i8x4_a, i8x4_b); + + // RV32: call i32 @llvm.riscv.ukadd16.i32 + ul_r = __rv__ukadd16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.ukadd16 + i16x2_r = __rv__v_ukadd16(i16x2_a, i16x2_b); + + // RV32: call i32 @llvm.riscv.ukaddh.i32 + l_r = __rv__ukaddh(i_a, i_b); + + // RV32: call i32 @llvm.riscv.ukaddw.i32 + l_r = __rv__ukaddw(i_a, i_b); + + // RV32: call i32 @llvm.riscv.ukcras16.i32 + ul_r = __rv__ukcras16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.ukcras16 + u16x2_r = __rv__v_ukcras16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.ukcrsa16.i32 + ul_r = __rv__ukcrsa16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.ukcrsa16 + u16x2_r = __rv__v_ukcrsa16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.ukstas16.i32 + ul_r = __rv__ukstas16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.ukstas16 + u16x2_r = __rv__v_ukstas16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.ukstsa16.i32 + ul_r = __rv__ukstsa16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.ukstsa16 + u16x2_r = __rv__v_ukstsa16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.uksub8.i32 + ul_r = __rv__uksub8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.uksub8 + u8x4_r = __rv__v_uksub8(u8x4_a, u8x4_b); + + // RV32: call i32 @llvm.riscv.uksub16.i32 + ul_r = __rv__uksub16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.uksub16 + u16x2_r = __rv__v_uksub16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.uksubh.i32 + ul_r = __rv__uksubh(ui_a, ui_b); + + // RV32: call i32 @llvm.riscv.uksubw.i32 + ul_r = __rv__uksubw(ui_a, ui_b); + + // RV32: call i32 @llvm.riscv.umaqa.i32 + ul_r = __rv__umaqa(ul_t, ul_a, ul_b); + // RV32: call i32 @llvm.riscv.v.umaqa + ui_r = __rv__v_umaqa(ui_t, u8x4_a, u8x4_b); + + // RV32: call i32 @llvm.riscv.umax8.i32 + ul_r = __rv__umax8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.umax8 + u8x4_r = __rv__v_umax8(u8x4_a, u8x4_b); + + // RV32: call i32 @llvm.riscv.umax16.i32 + ul_r = __rv__umax16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.umax16 + u16x2_r = __rv__v_umax16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.umin8.i32 + ul_r = __rv__umin8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.umin8 + u8x4_r = __rv__v_umin8(u8x4_a, u8x4_b); + + // RV32: call i32 @llvm.riscv.umin16.i32 + ul_r = __rv__umin16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.umin16 + u16x2_r = __rv__v_umin16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.uradd8.i32 + ul_r = __rv__uradd8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.uradd8 + u8x4_r = __rv__v_uradd8(u8x4_a, u8x4_b); + + // RV32: call i32 @llvm.riscv.uradd16.i32 + ul_r = __rv__uradd16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.uradd16 + u16x2_r = __rv__v_uradd16(u16x2_a, u16x2_b); + + // RV32: call 
i32 @llvm.riscv.uraddw.i32 + ul_r = __rv__uraddw(ui_a, ui_b); + + // RV32: call i32 @llvm.riscv.urcras16.i32 + ul_r = __rv__urcras16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.urcras16 + u16x2_r = __rv__v_urcras16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.urcrsa16.i32 + ul_r = __rv__urcrsa16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.urcrsa16 + u16x2_r = __rv__v_urcrsa16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.urstas16.i32 + ul_r = __rv__urstas16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.urstas16 + u16x2_r = __rv__v_urstas16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.urstsa16.i32 + ul_r = __rv__urstsa16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.urstsa16 + u16x2_r = __rv__v_urstsa16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.ursub8.i32 + ul_r = __rv__ursub8(ul_a, ul_b); + // RV32: call <4 x i8> @llvm.riscv.v.ursub8 + u8x4_r = __rv__v_ursub8(u8x4_a, u8x4_b); + + // RV32: call i32 @llvm.riscv.ursub16.i32 + ul_r = __rv__ursub16(ul_a, ul_b); + // RV32: call <2 x i16> @llvm.riscv.v.ursub16 + u16x2_r = __rv__v_ursub16(u16x2_a, u16x2_b); + + // RV32: call i32 @llvm.riscv.ursubw.i32 + ul_r = __rv__ursubw(ui_a, ui_b); + + // RV32: call i32 @llvm.riscv.zunpkd810.i32 + ul_r = __rv__zunpkd810(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.zunpkd810 + u16x2_r = __rv__v_zunpkd810(u8x4_a); + // RV32: call i32 @llvm.riscv.zunpkd820.i32 + ul_r = __rv__zunpkd820(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.zunpkd820 + u16x2_r = __rv__v_zunpkd820(u8x4_a); + // RV32: call i32 @llvm.riscv.zunpkd830.i32 + ul_r = __rv__zunpkd830(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.zunpkd830 + u16x2_r = __rv__v_zunpkd830(u8x4_a); + // RV32: call i32 @llvm.riscv.zunpkd831.i32 + ul_r = __rv__zunpkd831(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.zunpkd831 + u16x2_r = __rv__v_zunpkd831(u8x4_a); + // RV32: call i32 @llvm.riscv.zunpkd832.i32 + ul_r = __rv__zunpkd832(ul_a); + // RV32: call <2 x i16> @llvm.riscv.v.zunpkd832 + u16x2_r = __rv__v_zunpkd832(u8x4_a); +} diff --git a/clang/test/CodeGen/builtins-riscv-rv64p.c b/clang/test/CodeGen/builtins-riscv-rv64p.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/builtins-riscv-rv64p.c @@ -0,0 +1,943 @@ +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64-unknown-elf -target-feature +experimental-p \ +// RUN: -emit-llvm -o - %s | FileCheck %s -check-prefix=RV64 + +typedef signed char int8x4_t __attribute((vector_size(4))); +typedef signed char int8x8_t __attribute((vector_size(8))); +typedef short int16x2_t __attribute((vector_size(4))); +typedef short int16x4_t __attribute__((vector_size(8))); +typedef short int16x8_t __attribute__((vector_size(16))); +typedef int int32x2_t __attribute__((vector_size(8))); +typedef int int32x4_t __attribute__((vector_size(16))); +typedef unsigned char uint8x4_t __attribute__((vector_size(4))); +typedef unsigned char uint8x8_t __attribute__((vector_size(8))); +typedef unsigned short uint16x2_t __attribute__((vector_size(4))); +typedef unsigned short uint16x4_t __attribute__((vector_size(8))); +typedef unsigned short uint16x8_t __attribute__((vector_size(16))); +typedef unsigned int uint32x2_t __attribute__((vector_size(8))); +typedef unsigned int uint32x4_t __attribute__((vector_size(16))); + +void test(void) { + int i_a = 0, i_b = 1; + + unsigned int ui_a = 0, ui_b = 1; + + long l_t = 0, l_a = 0, l_b = 1; + long l_r; + + unsigned long ul_t = 0, ul_a = 1, ul_b = 2, ul_c = 3; + unsigned long ul_r; + + long long ll_t = 0, ll_a = 
1, ll_b = 2; + long long ll_r; + + unsigned long long ull_t = 0, ull_a = 1, ull_b = 2; + unsigned long long ull_r; + + int8x4_t i8x4_a = {0, 1, 2, 3}; + int8x4_t i8x4_b = {0, 1, 2, 3}; + + uint8x4_t u8x4_a = {0, 1, 2, 3}; + uint8x4_t u8x4_b = {0, 1, 2, 3}; + + int8x8_t i8x8_a = {0, 1, 2, 3, 4, 5, 6, 7}; + int8x8_t i8x8_b = {0, 1, 2, 3, 4, 5, 6, 7}; + int8x8_t i8x8_r; + + uint8x8_t u8x8_a = {0, 1, 2, 3, 4, 5, 6, 7}; + uint8x8_t u8x8_b = {0, 1, 2, 3, 4, 5, 6, 7}; + uint8x8_t u8x8_r; + + int16x2_t i16x2_a = {0, 1}; + int16x2_t i16x2_b = {0, 1}; + + uint16x2_t u16x2_a = {0, 1}; + uint16x2_t u16x2_b = {0, 1}; + + int16x4_t i16x4_a = {0, 1, 2, 3}; + int16x4_t i16x4_b = {0, 1, 2, 3}; + int16x4_t i16x4_r; + + uint16x4_t u16x4_a = {0, 1, 2, 3}; + uint16x4_t u16x4_b = {0, 1, 2, 3}; + uint16x4_t u16x4_r; + + int32x2_t i32x2_t = {0, 1}; + int32x2_t i32x2_a = {0, 1}; + int32x2_t i32x2_b = {0, 1}; + int32x2_t i32x2_r; + + uint32x2_t u32x2_t = {0, 1}; + uint32x2_t u32x2_a = {0, 1}; + uint32x2_t u32x2_b = {0, 1}; + uint32x2_t u32x2_r; + + // RV64: call i64 @llvm.riscv.add8.i64 + ul_r = __rv__add8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.add8 + u8x8_r = __rv__v_uadd8(u8x8_a, u8x8_b); + // RV64: call <8 x i8> @llvm.riscv.v.add8 + i8x8_r = __rv__v_sadd8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.add16.i64 + ul_r = __rv__add16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.add16 + u16x4_r = __rv__v_uadd16(u16x4_a, u16x4_b); + // RV64: call <4 x i16> @llvm.riscv.v.add16 + i16x4_r = __rv__v_sadd16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.ave.i64 + l_r = __rv__ave(l_a, l_b); + + // RV64: call i64 @llvm.riscv.bitrev.i64 + ul_r = __rv__bitrev(ul_a, ul_b); + + // RV64: call i64 @llvm.riscv.bpick.i64 + ul_r = __rv__bpick(ul_a, ul_b, ul_c); + + // RV64: call i64 @llvm.riscv.clrs8.i64 + ul_r = __rv__clrs8(ul_a); + // RV64: call <8 x i8> @llvm.riscv.v.clrs8 + u8x8_r = __rv__v_clrs8(i8x8_a); + + // RV64: call i64 @llvm.riscv.clrs16.i64 + ul_r = __rv__clrs16(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.clrs16 + u16x4_r = __rv__v_clrs16(i16x4_a); + + // RV64: call i64 @llvm.riscv.clrs32.i64 + ul_r = __rv__clrs32(ul_a); + // RV64: call <2 x i32> @llvm.riscv.v.clrs32 + u32x2_r = __rv__v_clrs32(i32x2_a); + + // RV64: call i64 @llvm.riscv.clo8.i64 + ul_r = __rv__clo8(ul_a); + // RV64: call <8 x i8> @llvm.riscv.v.clo8 + u8x8_r = __rv__v_clo8(i8x8_a); + + // RV64: call i64 @llvm.riscv.clo16.i64 + ul_r = __rv__clo16(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.clo16 + u16x4_r = __rv__v_clo16(i16x4_a); + + // RV64: call i64 @llvm.riscv.clo32.i64 + ul_r = __rv__clo32(ul_a); + // RV64: call <2 x i32> @llvm.riscv.v.clo32 + u32x2_r = __rv__v_clo32(i32x2_a); + + // RV64: call i64 @llvm.riscv.clz8.i64 + ul_r = __rv__clz8(ul_a); + // RV64: call <8 x i8> @llvm.riscv.v.clz8 + u8x8_r = __rv__v_clz8(i8x8_a); + + // RV64: call i64 @llvm.riscv.clz16.i64 + ul_r = __rv__clz16(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.clz16 + u16x4_r = __rv__v_clz16(i16x4_a); + + // RV64: call i64 @llvm.riscv.clz32.i64 + ul_r = __rv__clz32(ul_a); + // RV64: call <2 x i32> @llvm.riscv.v.clz32 + u32x2_r = __rv__v_clz32(i32x2_a); + + // RV64: call i64 @llvm.riscv.cmpeq8.i64 + ul_r = __rv__cmpeq8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.cmpeq8 + u8x8_r = __rv__v_scmpeq8(i8x8_a, i8x8_b); + // RV64: call <8 x i8> @llvm.riscv.v.cmpeq8 + u8x8_r = __rv__v_ucmpeq8(u8x8_a, u8x8_b); + + // RV64: call i64 @llvm.riscv.cmpeq16.i64 + ul_r = __rv__cmpeq16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.cmpeq16 + 
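// Note on the convention exercised throughout this file: the scalar __rv__*
// builtins take and return XLEN-wide packed data in ordinary integers
// (unsigned long here) and lower to the @llvm.riscv.*.i64 intrinsics, while
// the __rv__v_* variants use the GCC-vector typedefs declared above and lower
// to the @llvm.riscv.v.* intrinsics on <N x iM> vector types. A hypothetical
// wrapper outside this test might look like the sketch below (illustrative
// only, not part of the patch or of the checked output):
//   static inline uint16x4_t cmpeq16_vec(uint16x4_t x, uint16x4_t y) {
//     return __rv__v_ucmpeq16(x, y);
//   }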
u16x4_r = __rv__v_scmpeq16(i16x4_a, i16x4_b); + // RV64: call <4 x i16> @llvm.riscv.v.cmpeq16 + u16x4_r = __rv__v_ucmpeq16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.cras16.i64 + ul_r = __rv__cras16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.cras16 + u16x4_r = __rv__v_ucras16(u16x4_a, u16x4_b); + // RV64: call <4 x i16> @llvm.riscv.v.cras16 + i16x4_r = __rv__v_scras16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.crsa16.i64 + ul_r = __rv__crsa16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.crsa16 + u16x4_r = __rv__v_ucrsa16(u16x4_a, u16x4_b); + // RV64: call <4 x i16> @llvm.riscv.v.crsa16 + i16x4_r = __rv__v_scrsa16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.insb.i64 + ul_r = __rv__insb(ul_a, ul_b, 5); + + // RV64: call i64 @llvm.riscv.kabs8.i64 + ul_r = __rv__kabs8(ul_a); + // RV64: call <8 x i8> @llvm.riscv.v.kabs8 + i8x8_r = __rv__v_kabs8(i8x8_a); + + // RV64: call i64 @llvm.riscv.kabs16.i64 + ul_r = __rv__kabs16(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.kabs16 + i16x4_r = __rv__v_kabs16(i16x4_a); + + // RV64: call i64 @llvm.riscv.kabsw.i64 + l_r = __rv__kabsw(l_a); + + // RV64: call i64 @llvm.riscv.kadd8.i64 + ul_r = __rv__kadd8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.kadd8 + i8x8_r = __rv__v_kadd8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.kadd16.i64 + ul_r = __rv__kadd16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.kadd16 + i16x4_r = __rv__v_kadd16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kaddh.i64 + l_r = __rv__kaddh(i_a, i_b); + + // RV64: call i64 @llvm.riscv.kaddw.i64 + l_r = __rv__kaddw(i_a, i_b); + + // RV64: call i64 @llvm.riscv.kcras16.i64 + ul_r = __rv__kcras16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.kcras16 + i16x4_r = __rv__v_kcras16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kcrsa16.i64 + ul_r = __rv__kcrsa16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.kcrsa16 + i16x4_r = __rv__v_kcrsa16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kdmbb.i64 + l_r = __rv__kdmbb(ui_a, ui_b); + // RV64: call i64 @llvm.riscv.v.kdmbb + l_r = __rv__v_kdmbb(i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kdmbt.i64 + l_r = __rv__kdmbt(ui_a, ui_b); + // RV64: call i64 @llvm.riscv.v.kdmbt + l_r = __rv__v_kdmbt(i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kdmtt.i64 + l_r = __rv__kdmtt(ui_a, ui_b); + // RV64: call i64 @llvm.riscv.v.kdmtt + l_r = __rv__v_kdmtt(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kdmabb.i64 + l_r = __rv__kdmabb(l_t, ui_a, ui_b); + // RV64: call i64 @llvm.riscv.v.kdmabb + l_r = __rv__v_kdmabb(l_t, i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kdmabt.i64 + l_r = __rv__kdmabt(l_t, ui_a, ui_b); + // RV64: call i64 @llvm.riscv.v.kdmabt + l_r = __rv__v_kdmabt(l_t, i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kdmatt.i64 + l_r = __rv__kdmatt(l_t, ui_a, ui_b); + // RV64: call i64 @llvm.riscv.v.kdmatt + l_r = __rv__v_kdmatt(l_t, i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.khm8.i64 + ul_r = __rv__khm8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.khm8 + i8x8_r = __rv__v_khm8(i8x8_a, i8x8_b); + // RV64: call i64 @llvm.riscv.khmx8.i64 + ul_r = __rv__khmx8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.khmx8 + i8x8_r = __rv__v_khmx8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.khm16.i64 + ul_r = __rv__khm16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.khm16 + i16x4_r = __rv__v_khm16(i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.khmx16.i64 + ul_r = __rv__khmx16(ul_a, ul_b); + // RV64: call <4 x 
i16> @llvm.riscv.v.khmx16 + i16x4_r = __rv__v_khmx16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.khmbb.i64 + l_r = __rv__khmbb(ui_a, ui_b); + // RV64: call i64 @llvm.riscv.v.khmbb + l_r = __rv__v_khmbb(i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.khmbt.i64 + l_r = __rv__khmbt(ui_a, ui_b); + // RV64: call i64 @llvm.riscv.v.khmbt + l_r = __rv__v_khmbt(i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.khmtt.i64 + l_r = __rv__khmtt(ui_a, ui_b); + // RV64: call i64 @llvm.riscv.v.khmtt + l_r = __rv__v_khmtt(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kmabb.i64 + l_r = __rv__kmabb(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmabb + i32x2_r = __rv__v_kmabb(i32x2_t, i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmabt.i64 + l_r = __rv__kmabt(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmabt + i32x2_r = __rv__v_kmabt(i32x2_t, i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmatt.i64 + l_r = __rv__kmatt(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmatt + i32x2_r = __rv__v_kmatt(i32x2_t, i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kmada.i64 + l_r = __rv__kmada(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmada + i32x2_r = __rv__v_kmada(i32x2_t, i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmaxda.i64 + l_r = __rv__kmaxda(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmaxda + i32x2_r = __rv__v_kmaxda(i32x2_t, i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kmads.i64 + l_r = __rv__kmads(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmads + i32x2_r = __rv__v_kmads(i32x2_t, i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmadrs.i64 + l_r = __rv__kmadrs(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmadrs + i32x2_r = __rv__v_kmadrs(i32x2_t, i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmaxds.i64 + l_r = __rv__kmaxds(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmaxds + i32x2_r = __rv__v_kmaxds(i32x2_t, i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kmda.i64 + l_r = __rv__kmda(ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmda + i32x2_r = __rv__v_kmda(i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmxda.i64 + l_r = __rv__kmxda(ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmxda + i32x2_r = __rv__v_kmxda(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kmmac.i64 + l_r = __rv__kmmac(l_t, l_a, l_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmac + i32x2_r = __rv__v_kmmac(i32x2_t, i32x2_a, i32x2_b); + // RV64: call i64 @llvm.riscv.kmmac.u.i64 + l_r = __rv__kmmac_u(l_t, l_a, l_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmac.u + i32x2_r = __rv__v_kmmac_u(i32x2_t, i32x2_a, i32x2_b); + + // RV64: call i64 @llvm.riscv.kmmawb.i64 + l_r = __rv__kmmawb(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmawb + i32x2_r = __rv__v_kmmawb(i32x2_t, i32x2_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmmawb.u.i64 + l_r = __rv__kmmawb_u(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmawb.u + i32x2_r = __rv__v_kmmawb_u(i32x2_t, i32x2_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kmmawb2.i64 + l_r = __rv__kmmawb2(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmawb2 + i32x2_r = __rv__v_kmmawb2(i32x2_t, i32x2_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmmawb2.u.i64 + l_r = __rv__kmmawb2_u(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmawb2.u + i32x2_r = __rv__v_kmmawb2_u(i32x2_t, i32x2_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kmmawt.i64 + l_r = __rv__kmmawt(l_t, ul_a, ul_b); + // RV64: 
call <2 x i32> @llvm.riscv.v.kmmawt + i32x2_r = __rv__v_kmmawt(i32x2_t, i32x2_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmmawt.u.i64 + l_r = __rv__kmmawt_u(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmawt.u + i32x2_r = __rv__v_kmmawt_u(i32x2_t, i32x2_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kmmawt2.i64 + l_r = __rv__kmmawt2(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmawt2 + i32x2_r = __rv__v_kmmawt2(i32x2_t, i32x2_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmmawt2.u.i64 + l_r = __rv__kmmawt2_u(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmawt2.u + i32x2_r = __rv__v_kmmawt2_u(i32x2_t, i32x2_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kmmsb.i64 + l_r = __rv__kmmsb(l_t, l_a, l_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmsb + i32x2_r = __rv__v_kmmsb(i32x2_t, i32x2_a, i32x2_b); + // RV64: call i64 @llvm.riscv.kmmsb.u.i64 + l_r = __rv__kmmsb_u(l_t, l_a, l_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmsb.u + i32x2_r = __rv__v_kmmsb_u(i32x2_t, i32x2_a, i32x2_b); + + // RV64: call i64 @llvm.riscv.kmmwb2.i64 + l_r = __rv__kmmwb2(l_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmwb2 + i32x2_r = __rv__v_kmmwb2(i32x2_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmmwb2.u.i64 + l_r = __rv__kmmwb2_u(l_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmwb2.u + i32x2_r = __rv__v_kmmwb2_u(i32x2_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kmmwt2.i64 + l_r = __rv__kmmwt2(l_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmwt2 + i32x2_r = __rv__v_kmmwt2(i32x2_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmmwt2.u.i64 + l_r = __rv__kmmwt2_u(l_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmmwt2.u + i32x2_r = __rv__v_kmmwt2_u(i32x2_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kmsda.i64 + l_r = __rv__kmsda(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmsda + i32x2_r = __rv__v_kmsda(i32x2_t, i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.kmsxda.i64 + l_r = __rv__kmsxda(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.kmsxda + i32x2_r = __rv__v_kmsxda(i32x2_t, i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.ksllw.i64 + l_r = __rv__ksllw(l_a, i_b); + + // RV64: call i64 @llvm.riscv.ksll8.i64 + ul_r = __rv__ksll8(ul_a, ui_b); + // RV64: call <8 x i8> @llvm.riscv.v.ksll8 + i8x8_r = __rv__v_ksll8(i8x8_a, ui_b); + + // RV64: call i64 @llvm.riscv.ksll16.i64 + ul_r = __rv__ksll16(ul_a, ui_b); + // RV64: call <4 x i16> @llvm.riscv.v.ksll16 + i16x4_r = __rv__v_ksll16(i16x4_a, ui_b); + + // RV64: call i64 @llvm.riscv.kslra8.i64 + ul_r = __rv__kslra8(ul_a, i_b); + // RV64: call <8 x i8> @llvm.riscv.v.kslra8 + i8x8_r = __rv__v_kslra8(i8x8_a, i_b); + // RV64: call i64 @llvm.riscv.kslra8.u.i64 + ul_r = __rv__kslra8_u(ul_a, i_b); + // RV64: call <8 x i8> @llvm.riscv.v.kslra8.u + i8x8_r = __rv__v_kslra8_u(i8x8_a, i_b); + + // RV64: call i64 @llvm.riscv.kslra16.i64 + ul_r = __rv__kslra16(ul_a, i_b); + // RV64: call <4 x i16> @llvm.riscv.v.kslra16 + i16x4_r = __rv__v_kslra16(i16x4_a, i_b); + // RV64: call i64 @llvm.riscv.kslra16.u.i64 + ul_r = __rv__kslra16_u(ul_a, i_b); + // RV64: call <4 x i16> @llvm.riscv.v.kslra16.u + i16x4_r = __rv__v_kslra16_u(i16x4_a, i_b); + + // RV64: call i64 @llvm.riscv.kstas16.i64 + ul_r = __rv__kstas16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.kstas16 + i16x4_r = __rv__v_kstas16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.kstsa16.i64 + ul_r = __rv__kstsa16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.kstsa16 + i16x4_r = __rv__v_kstsa16(i16x4_a, i16x4_b); + + // 
RV64: call i64 @llvm.riscv.ksub8.i64 + ul_r = __rv__ksub8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.ksub8 + i8x8_r = __rv__v_ksub8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.ksub16.i64 + ul_r = __rv__ksub16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.ksub16 + i16x4_r = __rv__v_ksub16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.ksubh.i64 + l_r = __rv__ksubh(i_a, i_b); + + // RV64: call i64 @llvm.riscv.ksubw.i64 + l_r = __rv__ksubw(i_a, i_b); + + // RV64: call i64 @llvm.riscv.kwmmul.i64 + l_r = __rv__kwmmul(l_a, l_b); + // RV64: call <2 x i32> @llvm.riscv.v.kwmmul + i32x2_r = __rv__v_kwmmul(i32x2_a, i32x2_b); + // RV64: call i64 @llvm.riscv.kwmmul.u.i64 + l_r = __rv__kwmmul_u(l_a, l_b); + // RV64: call <2 x i32> @llvm.riscv.v.kwmmul.u + i32x2_r = __rv__v_kwmmul_u(i32x2_a, i32x2_b); + + // RV64: call i64 @llvm.riscv.maxw.i64 + l_r = __rv__maxw(i_a, i_b); + + // RV64: call i64 @llvm.riscv.minw.i64 + l_r = __rv__minw(i_a, i_b); + + // RV64: call i64 @llvm.riscv.pbsad.i64 + ul_r = __rv__pbsad(ul_a, ul_b); + // RV64: call i64 @llvm.riscv.v.pbsad + ul_r = __rv__v_pbsad(u8x8_a, u8x8_b); + + // RV64: call i64 @llvm.riscv.pbsada.i64 + ul_r = __rv__pbsada(ul_t, ul_a, ul_b); + // RV64: call i64 @llvm.riscv.v.pbsada + ul_r = __rv__v_pbsada(ul_t, u8x8_a, u8x8_b); + + // RV64: call i64 @llvm.riscv.pkbb16.i64 + ul_r = __rv__pkbb16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.pkbb16 + u16x4_r = __rv__v_pkbb16(u16x4_a, u16x4_b); + // RV64: call i64 @llvm.riscv.pkbt16.i64 + ul_r = __rv__pkbt16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.pkbt16 + u16x4_r = __rv__v_pkbt16(u16x4_a, u16x4_b); + // RV64: call i64 @llvm.riscv.pktt16.i64 + ul_r = __rv__pktt16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.pktt16 + u16x4_r = __rv__v_pktt16(u16x4_a, u16x4_b); + // RV64: call i64 @llvm.riscv.pktb16.i64 + ul_r = __rv__pktb16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.pktb16 + u16x4_r = __rv__v_pktb16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.radd8.i64 + ul_r = __rv__radd8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.radd8 + i8x8_r = __rv__v_radd8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.radd16.i64 + ul_r = __rv__radd16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.radd16 + i16x4_r = __rv__v_radd16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.raddw.i64 + l_r = __rv__raddw(i_a, i_b); + + // RV64: call i64 @llvm.riscv.rcras16.i64 + ul_r = __rv__rcras16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.rcras16 + i16x4_r = __rv__v_rcras16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.rcrsa16.i64 + ul_r = __rv__rcrsa16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.rcrsa16 + i16x4_r = __rv__v_rcrsa16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.rstas16.i64 + ul_r = __rv__rstas16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.rstas16 + i16x4_r = __rv__v_rstas16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.rstsa16.i64 + ul_r = __rv__rstsa16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.rstsa16 + i16x4_r = __rv__v_rstsa16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.rsub8.i64 + ul_r = __rv__rsub8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.rsub8 + i8x8_r = __rv__v_rsub8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.rsub16.i64 + ul_r = __rv__rsub16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.rsub16 + i16x4_r = __rv__v_rsub16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.rsubw.i64 + l_r = __rv__rsubw(i_a, i_b); + + // RV64: call i64 
@llvm.riscv.sclip8.i64 + ul_r = __rv__sclip8(ul_a, 7); + // RV64: call <8 x i8> @llvm.riscv.v.sclip8 + i8x8_r = __rv__v_sclip8(i8x8_a, 7); + + // RV64: call i64 @llvm.riscv.sclip16.i64 + ul_r = __rv__sclip16(ul_a, 8); + // RV64: call <4 x i16> @llvm.riscv.v.sclip16 + i16x4_r = __rv__v_sclip16(i16x4_a, 8); + + // RV64: call i64 @llvm.riscv.sclip32.i64 + l_r = __rv__sclip32(l_a, 9); + // RV64: call <2 x i32> @llvm.riscv.v.sclip32 + i32x2_r = __rv__v_sclip32(i32x2_a, 9); + + // RV64: call i64 @llvm.riscv.scmple8.i64 + ul_r = __rv__scmple8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.scmple8 + u8x8_r = __rv__v_scmple8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.scmple16.i64 + ul_r = __rv__scmple16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.scmple16 + u16x4_r = __rv__v_scmple16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.scmplt8.i64 + ul_r = __rv__scmplt8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.scmplt8 + u8x8_r = __rv__v_scmplt8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.scmplt16.i64 + ul_r = __rv__scmplt16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.scmplt16 + u16x4_r = __rv__v_scmplt16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.sll8.i64 + ul_r = __rv__sll8(ul_a, ui_b); + // RV64: call <8 x i8> @llvm.riscv.v.sll8 + u8x8_r = __rv__v_sll8(u8x8_a, ui_b); + + // RV64: call i64 @llvm.riscv.sll16.i64 + ul_r = __rv__sll16(ul_a, ui_b); + // RV64: call <4 x i16> @llvm.riscv.v.sll16 + u16x4_r = __rv__v_sll16(u16x4_a, ui_b); + // RV64: call i64 @llvm.riscv.smaqa.i64 + l_r = __rv__smaqa(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smaqa + i32x2_r = __rv__v_smaqa(i32x2_t, i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.smaqa.su.i64 + l_r = __rv__smaqa_su(l_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smaqa.su + i32x2_r = __rv__v_smaqa_su(i32x2_t, i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.smax8.i64 + ul_r = __rv__smax8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.smax8 + i8x8_r = __rv__v_smax8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.smax16.i64 + ul_r = __rv__smax16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.smax16 + i16x4_r = __rv__v_smax16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.smbb16.i64 + l_r = __rv__smbb16(ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smbb16 + i32x2_r = __rv__v_smbb16(i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.smbt16.i64 + l_r = __rv__smbt16(ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smbt16 + i32x2_r = __rv__v_smbt16(i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.smtt16.i64 + l_r = __rv__smtt16(ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smtt16 + i32x2_r = __rv__v_smtt16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.smds.i64 + l_r = __rv__smds(ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smds + i32x2_r = __rv__v_smds(i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.smdrs.i64 + l_r = __rv__smdrs(ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smdrs + i32x2_r = __rv__v_smdrs(i16x4_a, i16x4_b); + // RV64: call i64 @llvm.riscv.smxds.i64 + l_r = __rv__smxds(ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smxds + i32x2_r = __rv__v_smxds(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.smin8.i64 + ul_r = __rv__smin8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.smin8 + i8x8_r = __rv__v_smin8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.smin16.i64 + ul_r = __rv__smin16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.smin16 + i16x4_r = __rv__v_smin16(i16x4_a, 
i16x4_b); + + // RV64: call i64 @llvm.riscv.smmul.i64 + l_r = __rv__smmul(l_a, l_b); + // RV64: call <2 x i32> @llvm.riscv.v.smmul + i32x2_r = __rv__v_smmul(i32x2_a, i32x2_b); + // RV64: call i64 @llvm.riscv.smmul.u.i64 + l_r = __rv__smmul_u(l_a, l_b); + // RV64: call <2 x i32> @llvm.riscv.v.smmul.u + i32x2_r = __rv__v_smmul_u(i32x2_a, i32x2_b); + + // RV64: call i64 @llvm.riscv.smmwb.i64 + l_r = __rv__smmwb(l_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smmwb + i32x2_r = __rv__v_smmwb(i32x2_a, i16x4_b); + // RV64: call i64 @llvm.riscv.smmwb.u.i64 + l_r = __rv__smmwb_u(l_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smmwb.u + i32x2_r = __rv__v_smmwb_u(i32x2_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.smmwt.i64 + l_r = __rv__smmwt(l_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smmwt + i32x2_r = __rv__v_smmwt(i32x2_a, i16x4_b); + // RV64: call i64 @llvm.riscv.smmwt.u.i64 + l_r = __rv__smmwt_u(l_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.smmwt.u + i32x2_r = __rv__v_smmwt_u(i32x2_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.sra.u.i64 + l_r = __rv__sra_u(l_a, ui_b); + + // RV64: call i64 @llvm.riscv.sra8.i64 + ul_r = __rv__sra8(ul_a, ui_b); + // RV64: call <8 x i8> @llvm.riscv.v.sra8 + i8x8_r = __rv__v_sra8(i8x8_a, ui_b); + // RV64: call i64 @llvm.riscv.sra8.u.i64 + ul_r = __rv__sra8_u(ul_a, ui_b); + // RV64: call <8 x i8> @llvm.riscv.v.sra8.u + i8x8_r = __rv__v_sra8_u(i8x8_a, ui_b); + + // RV64: call i64 @llvm.riscv.sra16.i64 + ul_r = __rv__sra16(ul_a, ui_b); + // RV64: call <4 x i16> @llvm.riscv.v.sra16 + i16x4_r = __rv__v_sra16(i16x4_a, ui_b); + // RV64: call i64 @llvm.riscv.sra16.u.i64 + ul_r = __rv__sra16_u(ul_a, ui_b); + // RV64: call <4 x i16> @llvm.riscv.v.sra16.u + i16x4_r = __rv__v_sra16_u(i16x4_a, ui_b); + + // RV64: call i64 @llvm.riscv.srl8.i64 + ul_r = __rv__srl8(ul_a, ui_b); + // RV64: call <8 x i8> @llvm.riscv.v.srl8 + u8x8_r = __rv__v_srl8(u8x8_a, ui_b); + // RV64: call i64 @llvm.riscv.srl8.u.i64 + ul_r = __rv__srl8_u(ul_a, ui_b); + // RV64: call <8 x i8> @llvm.riscv.v.srl8.u + u8x8_r = __rv__v_srl8_u(u8x8_a, ui_b); + + // RV64: call i64 @llvm.riscv.srl16.i64 + ul_r = __rv__srl16(ul_a, ui_b); + // RV64: call <4 x i16> @llvm.riscv.v.srl16 + u16x4_r = __rv__v_srl16(u16x4_a, ui_b); + // RV64: call i64 @llvm.riscv.srl16.u.i64 + ul_r = __rv__srl16_u(ul_a, ui_b); + // RV64: call <4 x i16> @llvm.riscv.v.srl16.u + u16x4_r = __rv__v_srl16_u(u16x4_a, ui_b); + + // RV64: call i64 @llvm.riscv.stas16.i64 + ul_r = __rv__stas16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.stas16 + u16x4_r = __rv__v_ustas16(u16x4_a, u16x4_b); + // RV64: call <4 x i16> @llvm.riscv.v.stas16 + i16x4_r = __rv__v_sstas16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.stsa16.i64 + ul_r = __rv__stsa16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.stsa16 + u16x4_r = __rv__v_ustsa16(u16x4_a, u16x4_b); + // RV64: call <4 x i16> @llvm.riscv.v.stsa16 + i16x4_r = __rv__v_sstsa16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.sub8.i64 + ul_r = __rv__sub8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.sub8 + u8x8_r = __rv__v_usub8(u8x8_a, u8x8_b); + // RV64: call <8 x i8> @llvm.riscv.v.sub8 + i8x8_r = __rv__v_ssub8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.sub16.i64 + ul_r = __rv__sub16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.sub16 + u16x4_r = __rv__v_usub16(u16x4_a, u16x4_b); + // RV64: call <4 x i16> @llvm.riscv.v.sub16 + i16x4_r = __rv__v_ssub16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.sunpkd810.i64 + ul_r = __rv__sunpkd810(ul_a); 
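// The sunpkd8xy/zunpkd8xy builtins below unpack two of the four bytes in each
// 32-bit word into two 16-bit elements; the two digits name the source bytes,
// high half then low half, with s* sign-extending and z* zero-extending. As a
// rough sketch of sunpkd810 on one word (my reading of the draft P-extension
// spec, not something this test checks): byte1 = 0x80, byte0 = 0x7f produce
// the halfwords 0xff80 (high) and 0x007f (low).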
+ // RV64: call <4 x i16> @llvm.riscv.v.sunpkd810 + i16x4_r = __rv__v_sunpkd810(i8x8_a); + // RV64: call i64 @llvm.riscv.sunpkd820.i64 + ul_r = __rv__sunpkd820(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.sunpkd820 + i16x4_r = __rv__v_sunpkd820(i8x8_a); + // RV64: call i64 @llvm.riscv.sunpkd830.i64 + ul_r = __rv__sunpkd830(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.sunpkd830 + i16x4_r = __rv__v_sunpkd830(i8x8_a); + // RV64: call i64 @llvm.riscv.sunpkd831.i64 + ul_r = __rv__sunpkd831(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.sunpkd831 + i16x4_r = __rv__v_sunpkd831(i8x8_a); + // RV64: call i64 @llvm.riscv.sunpkd832.i64 + ul_r = __rv__sunpkd832(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.sunpkd832 + i16x4_r = __rv__v_sunpkd832(i8x8_a); + + // RV64: call i64 @llvm.riscv.swap8.i64 + ul_r = __rv__swap8(ul_a); + // RV64: call <8 x i8> @llvm.riscv.v.swap8 + u8x8_r = __rv__v_swap8(u8x8_a); + + // RV64: call i64 @llvm.riscv.swap16.i64 + ul_r = __rv__swap16(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.swap16 + u16x4_r = __rv__v_swap16(u16x4_a); + + // RV64: call i64 @llvm.riscv.uclip8.i64 + ul_r = __rv__uclip8(ul_a, 7); + // RV64: call <8 x i8> @llvm.riscv.v.uclip8 + u8x8_r = __rv__v_uclip8(u8x8_a, 7); + + // RV64: call i64 @llvm.riscv.uclip16.i64 + ul_r = __rv__uclip16(ul_a, 8); + // RV64: call <4 x i16> @llvm.riscv.v.uclip16 + u16x4_r = __rv__v_uclip16(u16x4_a, 8); + + // RV64: call i64 @llvm.riscv.uclip32.i64 + l_r = __rv__uclip32(l_a, 9); + // RV64: call <2 x i32> @llvm.riscv.v.uclip32 + u32x2_r = __rv__v_uclip32(u32x2_a, 9); + + // RV64: call i64 @llvm.riscv.ucmple8.i64 + ul_r = __rv__ucmple8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.ucmple8 + u8x8_r = __rv__v_ucmple8(u8x8_a, u8x8_b); + + // RV64: call i64 @llvm.riscv.ucmple16.i64 + ul_r = __rv__ucmple16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.ucmple16 + u16x4_r = __rv__v_ucmple16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.ucmplt8.i64 + ul_r = __rv__ucmplt8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.ucmplt8 + u8x8_r = __rv__v_ucmplt8(u8x8_a, u8x8_b); + + // RV64: call i64 @llvm.riscv.ucmplt16.i64 + ul_r = __rv__ucmplt16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.ucmplt16 + u16x4_r = __rv__v_ucmplt16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.ukadd8.i64 + ul_r = __rv__ukadd8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.ukadd8 + i8x8_r = __rv__v_ukadd8(i8x8_a, i8x8_b); + + // RV64: call i64 @llvm.riscv.ukadd16.i64 + ul_r = __rv__ukadd16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.ukadd16 + i16x4_r = __rv__v_ukadd16(i16x4_a, i16x4_b); + + // RV64: call i64 @llvm.riscv.ukaddh.i64 + ul_r = __rv__ukaddh(ui_a, ui_b); + + // RV64: call i64 @llvm.riscv.ukaddw.i64 + ul_r = __rv__ukaddw(ui_a, ui_b); + + // RV64: call i64 @llvm.riscv.ukcras16.i64 + ul_r = __rv__ukcras16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.ukcras16 + u16x4_r = __rv__v_ukcras16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.ukcrsa16.i64 + ul_r = __rv__ukcrsa16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.ukcrsa16 + u16x4_r = __rv__v_ukcrsa16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.ukstas16.i64 + ul_r = __rv__ukstas16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.ukstas16 + u16x4_r = __rv__v_ukstas16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.ukstsa16.i64 + ul_r = __rv__ukstsa16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.ukstsa16 + u16x4_r = __rv__v_ukstsa16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.uksub8.i64 + ul_r = 
__rv__uksub8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.uksub8 + u8x8_r = __rv__v_uksub8(u8x8_a, u8x8_b); + + // RV64: call i64 @llvm.riscv.uksub16.i64 + ul_r = __rv__uksub16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.uksub16 + u16x4_r = __rv__v_uksub16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.uksubh.i64 + ul_r = __rv__uksubh(ui_a, ui_b); + + // RV64: call i64 @llvm.riscv.uksubw.i64 + ul_r = __rv__uksubw(ui_a, ui_b); + + // RV64: call i64 @llvm.riscv.umaqa.i64 + ul_r = __rv__umaqa(ul_t, ul_a, ul_b); + // RV64: call <2 x i32> @llvm.riscv.v.umaqa + u32x2_r = __rv__v_umaqa(u32x2_t, u8x8_a, u8x8_b); + + // RV64: call i64 @llvm.riscv.umax8.i64 + ul_r = __rv__umax8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.umax8 + u8x8_r = __rv__v_umax8(u8x8_a, u8x8_b); + + // RV64: call i64 @llvm.riscv.umax16.i64 + ul_r = __rv__umax16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.umax16 + u16x4_r = __rv__v_umax16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.umin8.i64 + ul_r = __rv__umin8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.umin8 + u8x8_r = __rv__v_umin8(u8x8_a, u8x8_b); + + // RV64: call i64 @llvm.riscv.umin16.i64 + ul_r = __rv__umin16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.umin16 + u16x4_r = __rv__v_umin16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.uradd8.i64 + ul_r = __rv__uradd8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.uradd8 + u8x8_r = __rv__v_uradd8(u8x8_a, u8x8_b); + + // RV64: call i64 @llvm.riscv.uradd16.i64 + ul_r = __rv__uradd16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.uradd16 + u16x4_r = __rv__v_uradd16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.uraddw.i64 + ul_r = __rv__uraddw(ui_a, ui_b); + + // RV64: call i64 @llvm.riscv.urcras16.i64 + ul_r = __rv__urcras16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.urcras16 + u16x4_r = __rv__v_urcras16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.urcrsa16.i64 + ul_r = __rv__urcrsa16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.urcrsa16 + u16x4_r = __rv__v_urcrsa16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.urstas16.i64 + ul_r = __rv__urstas16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.urstas16 + u16x4_r = __rv__v_urstas16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.urstsa16.i64 + ul_r = __rv__urstsa16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.urstsa16 + u16x4_r = __rv__v_urstsa16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.ursub8.i64 + ul_r = __rv__ursub8(ul_a, ul_b); + // RV64: call <8 x i8> @llvm.riscv.v.ursub8 + u8x8_r = __rv__v_ursub8(u8x8_a, u8x8_b); + + // RV64: call i64 @llvm.riscv.ursub16.i64 + ul_r = __rv__ursub16(ul_a, ul_b); + // RV64: call <4 x i16> @llvm.riscv.v.ursub16 + u16x4_r = __rv__v_ursub16(u16x4_a, u16x4_b); + + // RV64: call i64 @llvm.riscv.ursubw.i64 + ul_r = __rv__ursubw(ui_a, ui_b); + + // RV64: call i64 @llvm.riscv.zunpkd810.i64 + ul_r = __rv__zunpkd810(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.zunpkd810 + u16x4_r = __rv__v_zunpkd810(u8x8_a); + // RV64: call i64 @llvm.riscv.zunpkd820.i64 + ul_r = __rv__zunpkd820(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.zunpkd820 + u16x4_r = __rv__v_zunpkd820(u8x8_a); + // RV64: call i64 @llvm.riscv.zunpkd830.i64 + ul_r = __rv__zunpkd830(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.zunpkd830 + u16x4_r = __rv__v_zunpkd830(u8x8_a); + // RV64: call i64 @llvm.riscv.zunpkd831.i64 + ul_r = __rv__zunpkd831(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.zunpkd831 + u16x4_r = 
__rv__v_zunpkd831(u8x8_a); + // RV64: call i64 @llvm.riscv.zunpkd832.i64 + ul_r = __rv__zunpkd832(ul_a); + // RV64: call <4 x i16> @llvm.riscv.v.zunpkd832 + u16x4_r = __rv__v_zunpkd832(u8x8_a); +} diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -1136,3 +1136,325 @@ } } // TargetPrefix = "riscv" + +//===----------------------------------------------------------------------===// +// Packing SIMD + +let TargetPrefix = "riscv" in { +class RISCVUnaryScalar + : Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>], + [IntrNoMem]>; + +multiclass RISCVUnaryScalar { + def "int_riscv_" # NAME : RISCVUnaryScalar; +} + +defm kabsw : RISCVUnaryScalar; + +class RISCVUnaryVector + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>], + [IntrNoMem]>; + +multiclass RISCVUnary { + def "int_riscv_" # NAME : RISCVUnaryScalar; + def "int_riscv_v_" # NAME : RISCVUnaryVector; +} + +defm clrs8 : RISCVUnary; +defm clrs16 : RISCVUnary; +defm clrs32 : RISCVUnary; +defm clo8 : RISCVUnary; +defm clo16 : RISCVUnary; +defm clo32 : RISCVUnary; +defm clz8 : RISCVUnary; +defm clz16 : RISCVUnary; +defm clz32 : RISCVUnary; +defm kabs8 : RISCVUnary; +defm kabs16 : RISCVUnary; +defm swap8 : RISCVUnary; +defm swap16 : RISCVUnary; + +class RISCVUnaryVectorPKD + : Intrinsic<[llvm_anyvector_ty], + [LLVMSubdivide2VectorType<0>], + [IntrNoMem]>; + +multiclass RISCVUnaryPKD { + def "int_riscv_" # NAME : RISCVUnaryScalar; + def "int_riscv_v_" # NAME : RISCVUnaryVectorPKD; +} + +defm sunpkd810 : RISCVUnaryPKD; +defm sunpkd820 : RISCVUnaryPKD; +defm sunpkd830 : RISCVUnaryPKD; +defm sunpkd831 : RISCVUnaryPKD; +defm sunpkd832 : RISCVUnaryPKD; +defm zunpkd810 : RISCVUnaryPKD; +defm zunpkd820 : RISCVUnaryPKD; +defm zunpkd830 : RISCVUnaryPKD; +defm zunpkd831 : RISCVUnaryPKD; +defm zunpkd832 : RISCVUnaryPKD; + +class RISCVBinaryScalar + : Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + +multiclass RISCVBinaryScalar { + def "int_riscv_" # NAME : RISCVBinaryScalar; +} + +defm ave : RISCVBinaryScalar; +defm bitrev : RISCVBinaryScalar; +defm kaddh : RISCVBinaryScalar; +defm kaddw : RISCVBinaryScalar; +defm ksllw : RISCVBinaryScalar; +defm ksubh : RISCVBinaryScalar; +defm ksubw : RISCVBinaryScalar; +defm maxw : RISCVBinaryScalar; +defm minw : RISCVBinaryScalar; +defm raddw : RISCVBinaryScalar; +defm rsubw : RISCVBinaryScalar; +defm sra_u : RISCVBinaryScalar; +defm ukaddh : RISCVBinaryScalar; +defm ukaddw : RISCVBinaryScalar; +defm uksubh : RISCVBinaryScalar; +defm uksubw : RISCVBinaryScalar; +defm uraddw : RISCVBinaryScalar; +defm ursubw : RISCVBinaryScalar; + +class RISCVBinaryVector + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + +multiclass RISCVBinary { + def "int_riscv_" # NAME : RISCVBinaryScalar; + def "int_riscv_v_" # NAME : RISCVBinaryVector; +} + +defm add8 : RISCVBinary; +defm add16 : RISCVBinary; +defm cmpeq8 : RISCVBinary; +defm cmpeq16 : RISCVBinary; +defm cras16 : RISCVBinary; +defm crsa16 : RISCVBinary; +defm kadd8 : RISCVBinary; +defm kadd16 : RISCVBinary; +defm kcras16 : RISCVBinary; +defm kcrsa16 : RISCVBinary; +defm khm8 : RISCVBinary; +defm khmx8 : RISCVBinary; +defm khm16 : RISCVBinary; +defm khmx16 : RISCVBinary; +defm kstas16 : RISCVBinary; +defm kstsa16 : RISCVBinary; +defm ksub8 : RISCVBinary; +defm ksub16 : RISCVBinary; +defm kwmmul : RISCVBinary; +defm kwmmul_u : RISCVBinary; +defm pkbb16 : RISCVBinary; +defm 
pkbt16 : RISCVBinary; +defm pktt16 : RISCVBinary; +defm pktb16 : RISCVBinary; +defm radd8 : RISCVBinary; +defm radd16 : RISCVBinary; +defm rcras16 : RISCVBinary; +defm rcrsa16 : RISCVBinary; +defm rstas16 : RISCVBinary; +defm rstsa16 : RISCVBinary; +defm rsub8 : RISCVBinary; +defm rsub16 : RISCVBinary; +defm scmple8 : RISCVBinary; +defm scmple16 : RISCVBinary; +defm scmplt8 : RISCVBinary; +defm scmplt16 : RISCVBinary; +defm smax8 : RISCVBinary; +defm smax16 : RISCVBinary; +defm smin8 : RISCVBinary; +defm smin16 : RISCVBinary; +defm smmul : RISCVBinary; +defm smmul_u : RISCVBinary; +defm stas16 : RISCVBinary; +defm stsa16 : RISCVBinary; +defm sub8 : RISCVBinary; +defm sub16 : RISCVBinary; +defm ucmple8 : RISCVBinary; +defm ucmple16 : RISCVBinary; +defm ucmplt8 : RISCVBinary; +defm ucmplt16 : RISCVBinary; +defm ukadd8 : RISCVBinary; +defm ukadd16 : RISCVBinary; +defm ukcras16 : RISCVBinary; +defm ukcrsa16 : RISCVBinary; +defm ukstas16 : RISCVBinary; +defm ukstsa16 : RISCVBinary; +defm uksub8 : RISCVBinary; +defm uksub16 : RISCVBinary; +defm umax8 : RISCVBinary; +defm umax16 : RISCVBinary; +defm umin8 : RISCVBinary; +defm umin16 : RISCVBinary; +defm uradd8 : RISCVBinary; +defm uradd16 : RISCVBinary; +defm urcras16 : RISCVBinary; +defm urcrsa16 : RISCVBinary; +defm urstas16 : RISCVBinary; +defm urstsa16 : RISCVBinary; +defm ursub8 : RISCVBinary; +defm ursub16 : RISCVBinary; + +class RISCVBinaryVectorABB + : Intrinsic<[llvm_any_ty], + [llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem]>; + +multiclass RISCVBinaryABB { + def "int_riscv_" # NAME : RISCVBinaryScalar; + def "int_riscv_v_" # NAME : RISCVBinaryVectorABB; +} + +defm kdmbb : RISCVBinaryABB; +defm kdmbt : RISCVBinaryABB; +defm kdmtt : RISCVBinaryABB; +defm khmbb : RISCVBinaryABB; +defm khmbt : RISCVBinaryABB; +defm khmtt : RISCVBinaryABB; +defm kmda : RISCVBinaryABB; +defm kmxda : RISCVBinaryABB; +defm pbsad : RISCVBinaryABB; +defm smbb16 : RISCVBinaryABB; +defm smbt16 : RISCVBinaryABB; +defm smtt16 : RISCVBinaryABB; +defm smds : RISCVBinaryABB; +defm smdrs : RISCVBinaryABB; +defm smxds : RISCVBinaryABB; + +class RISCVBinaryVectorAAB + : Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_any_ty], + [IntrNoMem]>; + +multiclass RISCVBinaryAAB { + def "int_riscv_" # NAME : RISCVBinaryScalar; + def "int_riscv_v_" # NAME : RISCVBinaryVectorAAB; +} + +defm kmmwb2 : RISCVBinaryAAB; +defm kmmwb2_u : RISCVBinaryAAB; +defm kmmwt2 : RISCVBinaryAAB; +defm kmmwt2_u : RISCVBinaryAAB; +defm smmwb : RISCVBinaryAAB; +defm smmwb_u : RISCVBinaryAAB; +defm smmwt : RISCVBinaryAAB; +defm smmwt_u : RISCVBinaryAAB; + +class RISCVBinaryVectorShift + : Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_anyint_ty], + [IntrNoMem]>; + +multiclass RISCVBinaryShift { + def "int_riscv_" # NAME : RISCVBinaryScalar; + def "int_riscv_v_" # NAME : RISCVBinaryVectorShift; +} + +defm ksll8 : RISCVBinaryShift; +defm ksll16 : RISCVBinaryShift; +defm kslra8 : RISCVBinaryShift; +defm kslra8_u : RISCVBinaryShift; +defm kslra16 : RISCVBinaryShift; +defm kslra16_u : RISCVBinaryShift; +defm sclip8 : RISCVBinaryShift; +defm sclip16 : RISCVBinaryShift; +defm sclip32 : RISCVBinaryShift; +defm sll8 : RISCVBinaryShift; +defm sll16 : RISCVBinaryShift; +defm sra8 : RISCVBinaryShift; +defm sra8_u : RISCVBinaryShift; +defm sra16 : RISCVBinaryShift; +defm sra16_u : RISCVBinaryShift; +defm srl8 : RISCVBinaryShift; +defm srl8_u : RISCVBinaryShift; +defm srl16 : RISCVBinaryShift; +defm srl16_u : RISCVBinaryShift; +defm uclip8 : RISCVBinaryShift; +defm uclip16 : RISCVBinaryShift; +defm uclip32 
: RISCVBinaryShift;
+
+class RISCVTernaryScalar
+    : Intrinsic<[llvm_any_ty],
+                [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+                [IntrNoMem]>;
+
+multiclass RISCVTernaryScalar {
+  def "int_riscv_" # NAME : RISCVTernaryScalar;
+}
+
+defm bpick : RISCVTernaryScalar;
+defm insb : RISCVTernaryScalar;
+
+class RISCVTernaryVector
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+                [IntrNoMem]>;
+
+multiclass RISCVTernary {
+  def "int_riscv_" # NAME : RISCVTernaryScalar;
+  def "int_riscv_v_" # NAME : RISCVTernaryVector;
+}
+
+defm kmmac : RISCVTernary;
+defm kmmac_u : RISCVTernary;
+defm kmmsb : RISCVTernary;
+defm kmmsb_u : RISCVTernary;
+
+class RISCVTernaryVectorAABB
+    : Intrinsic<[llvm_any_ty],
+                [LLVMMatchType<0>, llvm_any_ty, LLVMMatchType<1>],
+                [IntrNoMem]>;
+
+multiclass RISCVTernaryAABB {
+  def "int_riscv_" # NAME : RISCVTernaryScalar;
+  def "int_riscv_v_" # NAME : RISCVTernaryVectorAABB;
+}
+
+defm kdmabb : RISCVTernaryAABB;
+defm kdmabt : RISCVTernaryAABB;
+defm kdmatt : RISCVTernaryAABB;
+defm kmabb : RISCVTernaryAABB;
+defm kmabt : RISCVTernaryAABB;
+defm kmatt : RISCVTernaryAABB;
+defm kmada : RISCVTernaryAABB;
+defm kmaxda : RISCVTernaryAABB;
+defm kmads : RISCVTernaryAABB;
+defm kmadrs : RISCVTernaryAABB;
+defm kmaxds : RISCVTernaryAABB;
+defm kmsda : RISCVTernaryAABB;
+defm kmsxda : RISCVTernaryAABB;
+defm pbsada : RISCVTernaryAABB;
+defm smaqa : RISCVTernaryAABB;
+defm smaqa_su : RISCVTernaryAABB;
+defm umaqa : RISCVTernaryAABB;
+
+class RISCVTernaryVectorAAAB
+    : Intrinsic<[llvm_any_ty],
+                [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty],
+                [IntrNoMem]>;
+
+multiclass RISCVTernaryAAAB {
+  def "int_riscv_" # NAME : RISCVTernaryScalar;
+  def "int_riscv_v_" # NAME : RISCVTernaryVectorAAAB;
+}
+
+defm kmmawb : RISCVTernaryAAAB;
+defm kmmawb_u : RISCVTernaryAAAB;
+defm kmmawb2 : RISCVTernaryAAAB;
+defm kmmawb2_u : RISCVTernaryAAAB;
+defm kmmawt : RISCVTernaryAAAB;
+defm kmmawt_u : RISCVTernaryAAAB;
+defm kmmawt2 : RISCVTernaryAAAB;
+defm kmmawt2_u : RISCVTernaryAAAB;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -168,6 +168,17 @@
     }
   }
+  if (Subtarget.hasStdExtP()) {
+    if (Subtarget.is64Bit()) {
+      addRegisterClass(MVT::v8i8, &RISCV::GPRPRegClass);
+      addRegisterClass(MVT::v4i16, &RISCV::GPRPRegClass);
+      addRegisterClass(MVT::v2i32, &RISCV::GPRPRegClass);
+    } else {
+      addRegisterClass(MVT::v4i8, &RISCV::GPRPRegClass);
+      addRegisterClass(MVT::v2i16, &RISCV::GPRPRegClass);
+    }
+  }
+
   // Compute derived properties from the register classes.
   computeRegisterProperties(STI.getRegisterInfo());
@@ -686,6 +697,41 @@
     }
   }
+  if (Subtarget.hasStdExtP()) {
+    const auto addTypeForP = [&](MVT VT, MVT PromotedBitwiseVT) {
+      // Expand all builtin opcodes.
+      for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+        setOperationAction(Opc, VT, Expand);
+
+      setOperationAction(ISD::BITCAST, VT, Legal);
+
+      // Promote load and store operations.
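+      // (Descriptive note: with AddPromotedToType below, the legalizer turns
+      // a load/store of these packed-SIMD vector types into a load/store of
+      // PromotedBitwiseVT plus a bitcast, so the whole vector moves through a
+      // single GPR instead of being scalarized; the callers pass i32 on RV32
+      // and i64 on RV64.)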
+      setOperationAction(ISD::LOAD, VT, Promote);
+      AddPromotedToType(ISD::LOAD, VT, PromotedBitwiseVT);
+      setOperationAction(ISD::STORE, VT, Promote);
+      AddPromotedToType(ISD::STORE, VT, PromotedBitwiseVT);
+    };
+
+    if (Subtarget.is64Bit()) {
+      addTypeForP(MVT::v8i8, MVT::i64);
+      addTypeForP(MVT::v4i16, MVT::i64);
+      addTypeForP(MVT::v2i32, MVT::i64);
+    } else {
+      addTypeForP(MVT::v4i8, MVT::i32);
+      addTypeForP(MVT::v2i16, MVT::i32);
+    }
+
+    // Expand all truncating stores and extending loads.
+    for (MVT VT0 : MVT::vector_valuetypes()) {
+      for (MVT VT1 : MVT::vector_valuetypes()) {
+        setTruncStoreAction(VT0, VT1, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT0, VT1, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT0, VT1, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT0, VT1, Expand);
+      }
+    }
+  }
+
   // Function alignments.
   const Align FunctionAlignment(Subtarget.hasStdExtC() ? 2 : 4);
   setMinFunctionAlignment(FunctionAlignment);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -944,3 +944,498 @@
 def : InstAlias<"rdov $rd", (CSRRS GPR:$rd, 0x009, X0)>;
 def : InstAlias<"clrov", (CSRRCI X0, 0x009, 1)>;
 }
+
+//===----------------------------------------------------------------------===//
+// Intrinsics codegen patterns
+//===----------------------------------------------------------------------===//
+
+class RVPBitconvertPat
+    : Pat<(DstVT (bitconvert (SrcVT SrcRC:$src))),
+          (COPY_TO_REGCLASS SrcRC:$src, DstRC)>;
+
+let Predicates = [HasStdExtP] in {
+def : RVPBitconvertPat;
+def : RVPBitconvertPat;
+def : RVPBitconvertPat;
+def : RVPBitconvertPat;
+def : RVPBitconvertPat;
+def : RVPBitconvertPat;
+
+def : RVPBitconvertPat;
+def : RVPBitconvertPat;
+def : RVPBitconvertPat;
+def : RVPBitconvertPat;
+def : RVPBitconvertPat;
+def : RVPBitconvertPat;
+}
+
+// Unary operation
+class RVPUnaryScalarPat
+    : Pat<(XLenVT (!cast("int_riscv_" # IntID) XLenVT:$rs1)),
+          (Inst GPR:$rs1)>;
+
+let Predicates = [HasStdExtP] in {
+def : RVPUnaryScalarPat;
+}
+
+class RVPUnaryVector8Pat
+    : Pat<(XLenI8VT (!cast("int_riscv_" # IntID) XLenI8VT:$rs1)),
+          (Inst GPRP:$rs1)>;
+
+multiclass RVPUnary8Pat {
+  def : RVPUnaryScalarPat;
+  def : RVPUnaryVector8Pat;
+}
+
+let Predicates = [HasStdExtP] in {
+defm : RVPUnary8Pat;
+defm : RVPUnary8Pat;
+defm : RVPUnary8Pat;
+defm : RVPUnary8Pat;
+defm : RVPUnary8Pat;
+}
+
+class RVPUnaryVector8PKDPat
+    : Pat<(XLenI16VT (!cast("int_riscv_" # IntID) XLenI8VT:$rs1)),
+          (Inst GPRP:$rs1)>;
+
+multiclass RVPUnary8PKDPat {
+  def : RVPUnaryScalarPat;
+  def : RVPUnaryVector8PKDPat;
+}
+
+let Predicates = [HasStdExtP] in {
+defm : RVPUnary8PKDPat;
+defm : RVPUnary8PKDPat;
+defm : RVPUnary8PKDPat;
+defm : RVPUnary8PKDPat;
+defm : RVPUnary8PKDPat;
+defm : RVPUnary8PKDPat;
+defm : RVPUnary8PKDPat;
+defm : RVPUnary8PKDPat;
+defm : RVPUnary8PKDPat;
+defm : RVPUnary8PKDPat;
+}
+
+class RVPUnaryVector16Pat
+    : Pat<(XLenI16VT (!cast("int_riscv_" # IntID) XLenI16VT:$rs1)),
+          (Inst GPRP:$rs1)>;
+
+multiclass RVPUnary16Pat {
+  def : RVPUnaryScalarPat;
+  def : RVPUnaryVector16Pat;
+}
+
+let Predicates = [HasStdExtP] in {
+defm : RVPUnary16Pat;
+defm : RVPUnary16Pat;
+defm : RVPUnary16Pat;
+defm : RVPUnary16Pat;
+defm : RVPUnary16Pat;
+}
+
+class RVPUnaryVector32Pat
+    : Pat<(XLenI32VT (!cast("int_riscv_" # IntID) XLenI32VT:$rs1)),
+          (Inst GPRP:$rs1)>;
+
+multiclass RVPUnary32Pat {
+  let Predicates = [HasStdExtP] in
+  def : RVPUnaryScalarPat;
+  let Predicates = [HasStdExtP,
IsRV64] in + def : RVPUnaryVector32Pat; +} + +defm : RVPUnary32Pat; +defm : RVPUnary32Pat; +defm : RVPUnary32Pat; + +// Binary operation +class RVPBinaryScalarPat + : Pat<(XLenVT (!cast("int_riscv_" # IntID) XLenVT:$rs1, XLenVT:$rs2)), + (Inst GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtP] in { +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +def : RVPBinaryScalarPat; +} + +class RVPBinaryVector8Pat + : Pat<(XLenI8VT (!cast("int_riscv_" # IntID) + XLenI8VT:$rs1, XLenI8VT:$rs2)), + (Inst GPRP:$rs1, GPRP:$rs2)>; + +multiclass RVPBinary8Pat { + def : RVPBinaryScalarPat; + def : RVPBinaryVector8Pat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +defm : RVPBinary8Pat; +} + +class RVPBinaryVector8PBSADPat + : Pat<(XLenVT (!cast("int_riscv_" # IntID) + XLenI8VT:$rs1, XLenI8VT:$rs2)), + (Inst GPRP:$rs1, GPRP:$rs2)>; + +multiclass RVPBinary8PBSADPat { + def : RVPBinaryScalarPat; + def : RVPBinaryVector8PBSADPat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPBinary8PBSADPat; +} + +class RVPBinaryVector8ShiftPat + : Pat<(XLenI8VT (!cast("int_riscv_" # IntID) + XLenI8VT:$rs1, XLenVT:$rs2)), + (Inst GPRP:$rs1, GPRP:$rs2)>; + +multiclass RVPBinary8ShiftPat { + def : RVPBinaryScalarPat; + def : RVPBinaryVector8ShiftPat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPBinary8ShiftPat; +defm : RVPBinary8ShiftPat; +defm : RVPBinary8ShiftPat; +defm : RVPBinary8ShiftPat; +defm : RVPBinary8ShiftPat; +defm : RVPBinary8ShiftPat; +defm : RVPBinary8ShiftPat; +defm : RVPBinary8ShiftPat; +defm : RVPBinary8ShiftPat; +defm : RVPBinary8ShiftPat; +} + +class RVPBinaryVector16Pat + : Pat<(XLenI16VT (!cast("int_riscv_" # IntID) + XLenI16VT:$rs1, XLenI16VT:$rs2)), + (Inst GPRP:$rs1, GPRP:$rs2)>; + +multiclass RVPBinary16Pat { + def : RVPBinaryScalarPat; + def : RVPBinaryVector16Pat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : 
RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +defm : RVPBinary16Pat; +} + +class RVPBinaryVector16KMPat + : Pat<(XLenVT (!cast("int_riscv_" # IntID) + XLenI16VT:$rs1, XLenI16VT:$rs2)), + (Inst GPRP:$rs1, GPRP:$rs2)>; + +multiclass RVPBinary16KMPat { + def : RVPBinaryScalarPat; + def : RVPBinaryVector16KMPat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPBinary16KMPat; +defm : RVPBinary16KMPat; +defm : RVPBinary16KMPat; +defm : RVPBinary16KMPat; +defm : RVPBinary16KMPat; +defm : RVPBinary16KMPat; +} + +class RVPBinaryVector16ShiftPat + : Pat<(XLenI16VT (!cast("int_riscv_" # IntID) + XLenI16VT:$rs1, XLenVT:$rs2)), + (Inst GPRP:$rs1, GPRP:$rs2)>; + +multiclass RVPBinary16ShiftPat { + def : RVPBinaryScalarPat; + def : RVPBinaryVector16ShiftPat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPBinary16ShiftPat; +defm : RVPBinary16ShiftPat; +defm : RVPBinary16ShiftPat; +defm : RVPBinary16ShiftPat; +defm : RVPBinary16ShiftPat; +defm : RVPBinary16ShiftPat; +defm : RVPBinary16ShiftPat; +defm : RVPBinary16ShiftPat; +defm : RVPBinary16ShiftPat; +defm : RVPBinary16ShiftPat; +} + +class RVPBinaryVector32Pat + : Pat<(XLenI32VT (!cast("int_riscv_" # IntID) + XLenI32VT:$rs1, XLenI32VT:$rs2)), + (Inst GPRP:$rs1, GPRP:$rs2)>; + +multiclass RVPBinary32Pat { + let Predicates = [HasStdExtP] in + def : RVPBinaryScalarPat; + let Predicates = [HasStdExtP, IsRV64] in + def : RVPBinaryVector32Pat; +} + +defm : RVPBinary32Pat; +defm : RVPBinary32Pat; +defm : RVPBinary32Pat; +defm : RVPBinary32Pat; + +class RVPBinaryVector32KMPat + : Pat<(XLenI32VT (!cast("int_riscv_" # IntID) + XLenI16VT:$rs1, XLenI16VT:$rs2)), + (Inst GPRP:$rs1, GPRP:$rs2)>; + +multiclass RVPBinary32KMPat { + def : RVPBinaryScalarPat; + def : RVPBinaryVector32KMPat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPBinary32KMPat; +defm : RVPBinary32KMPat; +defm : RVPBinary32KMPat; +defm : RVPBinary32KMPat; +defm : RVPBinary32KMPat; +defm : RVPBinary32KMPat; +defm : RVPBinary32KMPat; +defm : RVPBinary32KMPat; +} + +class RVPBinaryVector32MMWPat + : Pat<(XLenI32VT (!cast("int_riscv_" # IntID) + XLenI32VT:$rs1, XLenI16VT:$rs2)), + (Inst GPRP:$rs1, GPRP:$rs2)>; + +multiclass RVPBinary32MMWPat { + def : RVPBinaryScalarPat; + def : RVPBinaryVector32MMWPat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPBinary32MMWPat; +defm : RVPBinary32MMWPat; +defm : RVPBinary32MMWPat; +defm : RVPBinary32MMWPat; +defm : RVPBinary32MMWPat; +defm : RVPBinary32MMWPat; +defm : RVPBinary32MMWPat; +defm : RVPBinary32MMWPat; +} + +class RVPBinaryVector32ShiftPat + : Pat<(XLenI32VT (!cast("int_riscv_" # IntID) + XLenI32VT:$rs1, XLenVT:$rs2)), + (Inst GPRP:$rs1, GPRP:$rs2)>; + +multiclass RVPBinary32ShiftPat { + let Predicates = [HasStdExtP] in + def : RVPBinaryScalarPat; + let Predicates = [HasStdExtP, IsRV64] in + def : RVPBinaryVector32ShiftPat; +} + +defm : RVPBinary32ShiftPat; +defm : RVPBinary32ShiftPat; + +// Ternary operation +class RVPTernaryScalarPat + : Pat<(XLenVT (!cast("int_riscv_" # IntID) + XLenVT:$rs1, XLenVT:$rs2, XLenVT:$rs3)), + (Inst GPR:$rs1, GPR:$rs2, GPR:$rs3)>; + +let Predicates = [HasStdExtP] in { +def : RVPTernaryScalarPat; +def : RVPTernaryScalarPat; +} + +class RVPTernaryVector8PBSADAPat + : Pat<(XLenVT (!cast("int_riscv_" # IntID) + XLenVT:$rs1, XLenI8VT:$rs2, XLenI8VT:$rs3)), + (Inst GPRP:$rs1, GPRP:$rs2, GPRP:$rs3)>; + +multiclass RVPTernary8PBSADAPat { + def : 
RVPTernaryScalarPat; + def : RVPTernaryVector8PBSADAPat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPTernary8PBSADAPat; +} + +class RVPTernaryVector8MAQAPat + : Pat<(XLenI32VT (!cast("int_riscv_" # IntID) + XLenI32VT:$rs1, XLenI8VT:$rs2, XLenI8VT:$rs3)), + (Inst GPRP:$rs1, GPRP:$rs2, GPRP:$rs3)>; + +multiclass RVPTernary8MAQAPat { + def : RVPTernaryScalarPat; + def : RVPTernaryVector8MAQAPat; +} + +defm : RVPTernary8MAQAPat; +defm : RVPTernary8MAQAPat; +defm : RVPTernary8MAQAPat; + +class RVPTernaryVector16KMPat + : Pat<(XLenVT (!cast("int_riscv_" # IntID) + XLenVT:$rs1, XLenI16VT:$rs2, XLenI16VT:$rs3)), + (Inst GPRP:$rs1, GPRP:$rs2, GPRP:$rs3)>; + +multiclass RVPTernary16KMPat { + def : RVPTernaryScalarPat; + def : RVPTernaryVector16KMPat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPTernary16KMPat; +defm : RVPTernary16KMPat; +defm : RVPTernary16KMPat; +defm : RVPTernary16KMPat; +defm : RVPTernary16KMPat; +defm : RVPTernary16KMPat; +} + +class RVPTernaryVector32Pat + : Pat<(XLenI32VT (!cast("int_riscv_" # IntID) + XLenI32VT:$rs1, XLenI32VT:$rs2, XLenI32VT:$rs3)), + (Inst GPRP:$rs1, GPRP:$rs2, GPRP:$rs3)>; + +multiclass RVPTernary32Pat { + let Predicates = [HasStdExtP] in + def : RVPTernaryScalarPat; + let Predicates = [HasStdExtP, IsRV64] in + def : RVPTernaryVector32Pat; +} + +defm : RVPTernary32Pat; +defm : RVPTernary32Pat; +defm : RVPTernary32Pat; +defm : RVPTernary32Pat; + +class RVPTernaryVector32KMPat + : Pat<(XLenI32VT (!cast("int_riscv_" # IntID) + XLenI32VT:$rs1, XLenI16VT:$rs2, XLenI16VT:$rs3)), + (Inst GPRP:$rs1, GPRP:$rs2, GPRP:$rs3)>; + +multiclass RVPTernary32KMPat { + def : RVPTernaryScalarPat; + def : RVPTernaryVector32KMPat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPTernary32KMPat; +defm : RVPTernary32KMPat; +defm : RVPTernary32KMPat; +defm : RVPTernary32KMPat; +defm : RVPTernary32KMPat; +defm : RVPTernary32KMPat; +defm : RVPTernary32KMPat; +defm : RVPTernary32KMPat; +defm : RVPTernary32KMPat; +defm : RVPTernary32KMPat; +} + +class RVPTernaryVector32KMMPat + : Pat<(XLenI32VT (!cast("int_riscv_" # IntID) + XLenI32VT:$rs1, XLenI32VT:$rs2, XLenI16VT:$rs3)), + (Inst GPRP:$rs1, GPRP:$rs2, GPRP:$rs3)>; + +multiclass RVPTernary32KMMPat { + def : RVPTernaryScalarPat; + def : RVPTernaryVector32KMMPat; +} + +let Predicates = [HasStdExtP] in { +defm : RVPTernary32KMMPat; +defm : RVPTernary32KMMPat; +defm : RVPTernary32KMMPat; +defm : RVPTernary32KMMPat; +defm : RVPTernary32KMMPat; +defm : RVPTernary32KMMPat; +defm : RVPTernary32KMMPat; +defm : RVPTernary32KMMPat; +} diff --git a/llvm/test/CodeGen/RISCV/intrinsics-rv32p.ll b/llvm/test/CodeGen/RISCV/intrinsics-rv32p.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/intrinsics-rv32p.ll @@ -0,0 +1,3626 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV32P %s + +define void @test() nounwind { +; RV32P-LABEL: test: +; RV32P: # %bb.0: # %entry +; RV32P-NEXT: addi sp, sp, -224 +; RV32P-NEXT: sw zero, 220(sp) +; RV32P-NEXT: sw zero, 216(sp) +; RV32P-NEXT: addi a1, zero, 1 +; RV32P-NEXT: sw a1, 212(sp) +; RV32P-NEXT: sw zero, 204(sp) +; RV32P-NEXT: sw a1, 200(sp) +; RV32P-NEXT: addi a2, zero, 2 +; RV32P-NEXT: sw a2, 196(sp) +; RV32P-NEXT: sw zero, 188(sp) +; RV32P-NEXT: sw a1, 184(sp) +; RV32P-NEXT: sw a2, 180(sp) +; RV32P-NEXT: sw zero, 172(sp) +; RV32P-NEXT: sw a1, 168(sp) +; RV32P-NEXT: sw a2, 164(sp) +; RV32P-NEXT: addi a0, zero, 3 +; RV32P-NEXT: sw 
a0, 160(sp) +; RV32P-NEXT: sw zero, 148(sp) +; RV32P-NEXT: sw zero, 144(sp) +; RV32P-NEXT: sw zero, 140(sp) +; RV32P-NEXT: sw a1, 136(sp) +; RV32P-NEXT: sw zero, 132(sp) +; RV32P-NEXT: sw a2, 128(sp) +; RV32P-NEXT: sw zero, 116(sp) +; RV32P-NEXT: sw zero, 112(sp) +; RV32P-NEXT: sw zero, 108(sp) +; RV32P-NEXT: lui a3, %hi(.LCPI0_0) +; RV32P-NEXT: lw a3, %lo(.LCPI0_0)(a3) +; RV32P-NEXT: sw a1, 104(sp) +; RV32P-NEXT: sw zero, 100(sp) +; RV32P-NEXT: sw a2, 96(sp) +; RV32P-NEXT: sw a3, 84(sp) +; RV32P-NEXT: lui a1, %hi(.LCPI0_1) +; RV32P-NEXT: lw a1, %lo(.LCPI0_1)(a1) +; RV32P-NEXT: sw a3, 80(sp) +; RV32P-NEXT: sw a3, 72(sp) +; RV32P-NEXT: sw a3, 68(sp) +; RV32P-NEXT: sw a1, 60(sp) +; RV32P-NEXT: sw a1, 56(sp) +; RV32P-NEXT: sw a1, 48(sp) +; RV32P-NEXT: sw a1, 44(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: add8 a1, a1, a2 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 72(sp) +; RV32P-NEXT: lw a2, 68(sp) +; RV32P-NEXT: add8 a1, a1, a2 +; RV32P-NEXT: sw a1, 64(sp) +; RV32P-NEXT: lw a1, 84(sp) +; RV32P-NEXT: lw a2, 80(sp) +; RV32P-NEXT: add8 a1, a1, a2 +; RV32P-NEXT: sw a1, 76(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: add16 a1, a1, a2 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 48(sp) +; RV32P-NEXT: lw a2, 44(sp) +; RV32P-NEXT: add16 a1, a1, a2 +; RV32P-NEXT: sw a1, 40(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: add16 a1, a1, a2 +; RV32P-NEXT: sw a1, 52(sp) +; RV32P-NEXT: lw a1, 184(sp) +; RV32P-NEXT: lw a2, 180(sp) +; RV32P-NEXT: ave a1, a1, a2 +; RV32P-NEXT: sw a1, 176(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: bitrev a1, a1, a2 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: lw a3, 160(sp) +; RV32P-NEXT: bpick a1, a1, a2, a3 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: clrs8 a1, a1 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 84(sp) +; RV32P-NEXT: clrs8 a1, a1 +; RV32P-NEXT: sw a1, 64(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: clrs16 a1, a1 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: clrs16 a1, a1 +; RV32P-NEXT: sw a1, 40(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: clrs32 a1, a1 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: clo8 a1, a1 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 72(sp) +; RV32P-NEXT: clo8 a1, a1 +; RV32P-NEXT: sw a1, 64(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: clo16 a1, a1 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 48(sp) +; RV32P-NEXT: clo16 a1, a1 +; RV32P-NEXT: sw a1, 40(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: clo32 a1, a1 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: clz8 a1, a1 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 72(sp) +; RV32P-NEXT: clz8 a1, a1 +; RV32P-NEXT: sw a1, 64(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: clz16 a1, a1 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 48(sp) +; RV32P-NEXT: clz16 a1, a1 +; RV32P-NEXT: sw a1, 40(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: clz32 a1, a1 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: cmpeq8 a1, a1, a2 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 84(sp) +; RV32P-NEXT: lw a2, 80(sp) +; RV32P-NEXT: cmpeq8 a1, a1, a2 +; RV32P-NEXT: sw a1, 64(sp) +; RV32P-NEXT: lw a1, 72(sp) +; RV32P-NEXT: lw a2, 68(sp) +; RV32P-NEXT: cmpeq8 a1, a1, a2 +; RV32P-NEXT: sw a1, 
64(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: cmpeq16 a1, a1, a2 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: cmpeq16 a1, a1, a2 +; RV32P-NEXT: sw a1, 40(sp) +; RV32P-NEXT: lw a1, 48(sp) +; RV32P-NEXT: lw a2, 44(sp) +; RV32P-NEXT: cmpeq16 a1, a1, a2 +; RV32P-NEXT: sw a1, 40(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: cras16 a1, a1, a2 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 48(sp) +; RV32P-NEXT: lw a2, 44(sp) +; RV32P-NEXT: cras16 a1, a1, a2 +; RV32P-NEXT: sw a1, 40(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: cras16 a1, a1, a2 +; RV32P-NEXT: sw a1, 52(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: crsa16 a1, a1, a2 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 48(sp) +; RV32P-NEXT: lw a2, 44(sp) +; RV32P-NEXT: crsa16 a1, a1, a2 +; RV32P-NEXT: sw a1, 40(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: crsa16 a1, a1, a2 +; RV32P-NEXT: sw a1, 52(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: insb a1, a2, a0 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: kabs8 a0, a0 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 84(sp) +; RV32P-NEXT: kabs8 a0, a0 +; RV32P-NEXT: sw a0, 76(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: kabs16 a0, a0 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: kabs16 a0, a0 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 184(sp) +; RV32P-NEXT: kabsw a0, a0 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kadd8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 84(sp) +; RV32P-NEXT: lw a1, 80(sp) +; RV32P-NEXT: kadd8 a0, a0, a1 +; RV32P-NEXT: sw a0, 76(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kadd16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kadd16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: kaddh a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: kaddw a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 200(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: kdmbb a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kdmbb a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 200(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: kdmbt a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kdmbt a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 200(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: kdmtt a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: 
kdmtt a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 200(sp) +; RV32P-NEXT: lw a2, 196(sp) +; RV32P-NEXT: kdmabb a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kdmabb a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 200(sp) +; RV32P-NEXT: lw a2, 196(sp) +; RV32P-NEXT: kdmabt a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kdmabt a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 200(sp) +; RV32P-NEXT: lw a2, 196(sp) +; RV32P-NEXT: kdmatt a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kdmatt a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: khm8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 84(sp) +; RV32P-NEXT: lw a1, 80(sp) +; RV32P-NEXT: khm8 a0, a0, a1 +; RV32P-NEXT: sw a0, 76(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: khmx8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 84(sp) +; RV32P-NEXT: lw a1, 80(sp) +; RV32P-NEXT: khmx8 a0, a0, a1 +; RV32P-NEXT: sw a0, 76(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: khm16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: khm16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: khmx16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: khmx16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 200(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: khmbb a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: khmbb a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 200(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: khmbt a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: khmbt a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 200(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: khmtt a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: khmtt a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmabb a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmabb a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmabt a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmabt a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmatt a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmatt a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) 
+; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmada a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmada a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmaxda a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmaxda a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmads a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmads a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmadrs a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmadrs a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmaxds a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmaxds a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kmda a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kmda a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kmxda a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kmxda a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 184(sp) +; RV32P-NEXT: lw a2, 180(sp) +; RV32P-NEXT: kmmac a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 184(sp) +; RV32P-NEXT: lw a2, 180(sp) +; RV32P-NEXT: kmmac.u a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmmawb a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 220(sp) +; RV32P-NEXT: lw a1, 216(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmmawb a0, a1, a2 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmmawb.u a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 220(sp) +; RV32P-NEXT: lw a1, 216(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmmawb.u a0, a1, a2 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmmawb2 a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 220(sp) +; RV32P-NEXT: lw a1, 216(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmmawb2 a0, a1, a2 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmmawb2.u a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 220(sp) +; RV32P-NEXT: lw a1, 216(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmmawb2.u a0, a1, a2 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw 
a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmmawt a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 220(sp) +; RV32P-NEXT: lw a1, 216(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmmawt a0, a1, a2 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmmawt.u a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 220(sp) +; RV32P-NEXT: lw a1, 216(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmmawt.u a0, a1, a2 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmmawt2 a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 220(sp) +; RV32P-NEXT: lw a1, 216(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmmawt2 a0, a1, a2 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmmawt2.u a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 220(sp) +; RV32P-NEXT: lw a1, 216(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmmawt2.u a0, a1, a2 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 184(sp) +; RV32P-NEXT: lw a2, 180(sp) +; RV32P-NEXT: kmmsb a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 184(sp) +; RV32P-NEXT: lw a2, 180(sp) +; RV32P-NEXT: kmmsb.u a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 184(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kmmwb2 a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kmmwb2 a0, a0, a1 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 184(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kmmwb2.u a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kmmwb2.u a0, a0, a1 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 184(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kmmwt2 a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kmmwt2 a0, a0, a1 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 184(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kmmwt2.u a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kmmwt2.u a0, a0, a1 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmsda a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 220(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmsda a0, a1, a2 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 188(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: kmsxda a0, a1, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 220(sp) +; RV32P-NEXT: lw a1, 60(sp) +; RV32P-NEXT: lw a2, 56(sp) +; RV32P-NEXT: kmsxda a0, a1, a2 +; RV32P-NEXT: sw a0, 208(sp) +; RV32P-NEXT: lw a0, 184(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: ksllw a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: ksll8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 84(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: ksll8 a0, a0, a1 +; RV32P-NEXT: sw a0, 76(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: 
ksll16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: ksll16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: kslra8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 84(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: kslra8 a0, a0, a1 +; RV32P-NEXT: sw a0, 76(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: kslra8.u a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 84(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: kslra8.u a0, a0, a1 +; RV32P-NEXT: sw a0, 76(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: kslra16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: kslra16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: kslra16.u a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: kslra16.u a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kstas16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kstas16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: kstsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: kstsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ksub8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 84(sp) +; RV32P-NEXT: lw a1, 80(sp) +; RV32P-NEXT: ksub8 a0, a0, a1 +; RV32P-NEXT: sw a0, 76(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ksub16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: ksub16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: ksubh a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: ksubw a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 184(sp) +; RV32P-NEXT: lw a1, 180(sp) +; RV32P-NEXT: kwmmul a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 184(sp) +; RV32P-NEXT: lw a1, 180(sp) +; RV32P-NEXT: kwmmul.u a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: maxw a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: minw a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: pbsad a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: lw a1, 68(sp) +; RV32P-NEXT: pbsad a0, a0, a1 +; RV32P-NEXT: sw a0, 192(sp) +; RV32P-NEXT: lw a0, 172(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: pbsada a0, a1, a2 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 172(sp) +; RV32P-NEXT: lw a1, 72(sp) +; RV32P-NEXT: lw a2, 68(sp) +; RV32P-NEXT: pbsada a0, a1, a2 +; RV32P-NEXT: sw a0, 192(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: pkbb16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw 
a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: pkbb16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: pkbt16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: pkbt16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: pktt16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: pktt16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: pktb16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: pktb16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: radd8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 84(sp) +; RV32P-NEXT: lw a1, 80(sp) +; RV32P-NEXT: radd8 a0, a0, a1 +; RV32P-NEXT: sw a0, 76(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: radd16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: radd16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: raddw a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: rcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: rcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: rcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: rcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: rcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: rcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: rcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: rcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: rsub8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 84(sp) +; RV32P-NEXT: lw a1, 80(sp) +; RV32P-NEXT: rsub8 a0, a0, a1 +; RV32P-NEXT: sw a0, 76(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: rsub16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: rsub16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: rsubw a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: addi a0, zero, 5 +; RV32P-NEXT: sclip8 a1, a1, a0 +; RV32P-NEXT: sw a1, 156(sp) +; RV32P-NEXT: lw a1, 84(sp) +; RV32P-NEXT: sclip8 a1, a1, a0 +; RV32P-NEXT: sw a1, 76(sp) +; RV32P-NEXT: lw a2, 168(sp) +; RV32P-NEXT: addi a1, zero, 6 +; RV32P-NEXT: sclip16 a2, a2, a1 +; RV32P-NEXT: sw a2, 156(sp) +; RV32P-NEXT: lw a2, 60(sp) +; RV32P-NEXT: sclip16 a2, a2, a1 +; RV32P-NEXT: sw a2, 52(sp) +; RV32P-NEXT: lw a3, 184(sp) +; RV32P-NEXT: addi a2, zero, 7 +; RV32P-NEXT: sclip32 a3, a3, a2 
+; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: scmple8 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: lw a4, 80(sp) +; RV32P-NEXT: scmple8 a3, a3, a4 +; RV32P-NEXT: sw a3, 64(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: scmple16 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: scmple16 a3, a3, a4 +; RV32P-NEXT: sw a3, 40(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: scmplt8 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: lw a4, 80(sp) +; RV32P-NEXT: scmplt8 a3, a3, a4 +; RV32P-NEXT: sw a3, 64(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: scmplt16 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: scmplt16 a3, a3, a4 +; RV32P-NEXT: sw a3, 40(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sll8 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 72(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sll8 a3, a3, a4 +; RV32P-NEXT: sw a3, 64(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sll16 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 48(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sll16 a3, a3, a4 +; RV32P-NEXT: sw a3, 40(sp) +; RV32P-NEXT: lw a3, 188(sp) +; RV32P-NEXT: lw a4, 168(sp) +; RV32P-NEXT: lw a5, 164(sp) +; RV32P-NEXT: smaqa a3, a4, a5 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 220(sp) +; RV32P-NEXT: lw a4, 84(sp) +; RV32P-NEXT: lw a5, 80(sp) +; RV32P-NEXT: smaqa a3, a4, a5 +; RV32P-NEXT: sw a3, 208(sp) +; RV32P-NEXT: lw a3, 188(sp) +; RV32P-NEXT: lw a4, 168(sp) +; RV32P-NEXT: lw a5, 164(sp) +; RV32P-NEXT: smaqa.su a3, a4, a5 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 220(sp) +; RV32P-NEXT: lw a4, 84(sp) +; RV32P-NEXT: lw a5, 80(sp) +; RV32P-NEXT: smaqa.su a3, a4, a5 +; RV32P-NEXT: sw a3, 208(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smax8 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: lw a4, 80(sp) +; RV32P-NEXT: smax8 a3, a3, a4 +; RV32P-NEXT: sw a3, 76(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smax16 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smax16 a3, a3, a4 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smbb16 a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smbb16 a3, a3, a4 +; RV32P-NEXT: sw a3, 208(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smbt16 a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smbt16 a3, a3, a4 +; RV32P-NEXT: sw a3, 208(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smtt16 a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smtt16 a3, a3, a4 +; RV32P-NEXT: sw a3, 208(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smds a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smds a3, a3, a4 +; RV32P-NEXT: sw a3, 
208(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smdrs a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smdrs a3, a3, a4 +; RV32P-NEXT: sw a3, 208(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smxds a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smxds a3, a3, a4 +; RV32P-NEXT: sw a3, 208(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smin8 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: lw a4, 80(sp) +; RV32P-NEXT: smin8 a3, a3, a4 +; RV32P-NEXT: sw a3, 76(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smin16 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smin16 a3, a3, a4 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 184(sp) +; RV32P-NEXT: lw a4, 180(sp) +; RV32P-NEXT: smmul a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 184(sp) +; RV32P-NEXT: lw a4, 180(sp) +; RV32P-NEXT: smmul.u a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 184(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smmwb a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 216(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smmwb a3, a3, a4 +; RV32P-NEXT: sw a3, 208(sp) +; RV32P-NEXT: lw a3, 184(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smmwb.u a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 216(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smmwb.u a3, a3, a4 +; RV32P-NEXT: sw a3, 208(sp) +; RV32P-NEXT: lw a3, 184(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smmwt a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 216(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smmwt a3, a3, a4 +; RV32P-NEXT: sw a3, 208(sp) +; RV32P-NEXT: lw a3, 184(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: smmwt.u a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 216(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: smmwt.u a3, a3, a4 +; RV32P-NEXT: sw a3, 208(sp) +; RV32P-NEXT: lw a3, 184(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sra.u a3, a3, a4 +; RV32P-NEXT: sw a3, 176(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sra8 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sra8 a3, a3, a4 +; RV32P-NEXT: sw a3, 76(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sra8.u a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sra8.u a3, a3, a4 +; RV32P-NEXT: sw a3, 76(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sra16 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sra16 a3, a3, a4 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sra16.u a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: sra16.u a3, a3, a4 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: srl8 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 72(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: srl8 a3, a3, a4 +; RV32P-NEXT: sw a3, 64(sp) +; RV32P-NEXT: lw 
a3, 168(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: srl8.u a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 72(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: srl8.u a3, a3, a4 +; RV32P-NEXT: sw a3, 64(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: srl16 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 48(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: srl16 a3, a3, a4 +; RV32P-NEXT: sw a3, 40(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: srl16.u a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 48(sp) +; RV32P-NEXT: lw a4, 196(sp) +; RV32P-NEXT: srl16.u a3, a3, a4 +; RV32P-NEXT: sw a3, 40(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: stas16 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 48(sp) +; RV32P-NEXT: lw a4, 44(sp) +; RV32P-NEXT: stas16 a3, a3, a4 +; RV32P-NEXT: sw a3, 40(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: stas16 a3, a3, a4 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: stsa16 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 48(sp) +; RV32P-NEXT: lw a4, 44(sp) +; RV32P-NEXT: stsa16 a3, a3, a4 +; RV32P-NEXT: sw a3, 40(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: stsa16 a3, a3, a4 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: sub8 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 72(sp) +; RV32P-NEXT: lw a4, 68(sp) +; RV32P-NEXT: sub8 a3, a3, a4 +; RV32P-NEXT: sw a3, 64(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: lw a4, 80(sp) +; RV32P-NEXT: sub8 a3, a3, a4 +; RV32P-NEXT: sw a3, 76(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: lw a4, 164(sp) +; RV32P-NEXT: sub16 a3, a3, a4 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 48(sp) +; RV32P-NEXT: lw a4, 44(sp) +; RV32P-NEXT: sub16 a3, a3, a4 +; RV32P-NEXT: sw a3, 40(sp) +; RV32P-NEXT: lw a3, 60(sp) +; RV32P-NEXT: lw a4, 56(sp) +; RV32P-NEXT: sub16 a3, a3, a4 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: sunpkd810 a3, a3 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: sunpkd810 a3, a3 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: sunpkd820 a3, a3 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: sunpkd820 a3, a3 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: sunpkd830 a3, a3 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: sunpkd830 a3, a3 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: sunpkd831 a3, a3 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: sunpkd831 a3, a3 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: sunpkd832 a3, a3 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 84(sp) +; RV32P-NEXT: sunpkd832 a3, a3 +; RV32P-NEXT: sw a3, 52(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: swap8 a3, a3 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 72(sp) +; RV32P-NEXT: swap8 a3, a3 +; RV32P-NEXT: sw a3, 64(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: swap16 a3, a3 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 48(sp) +; RV32P-NEXT: swap16 a3, a3 +; RV32P-NEXT: sw a3, 40(sp) +; RV32P-NEXT: lw a3, 168(sp) +; RV32P-NEXT: uclip8 a3, a3, a0 +; RV32P-NEXT: sw a3, 156(sp) +; RV32P-NEXT: lw a3, 72(sp) +; RV32P-NEXT: 
uclip8 a0, a3, a0 +; RV32P-NEXT: sw a0, 64(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: uclip16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: uclip16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 184(sp) +; RV32P-NEXT: uclip32 a0, a0, a2 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ucmple8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: lw a1, 68(sp) +; RV32P-NEXT: ucmple8 a0, a0, a1 +; RV32P-NEXT: sw a0, 64(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ucmple16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: ucmple16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ucmplt8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: lw a1, 68(sp) +; RV32P-NEXT: ucmplt8 a0, a0, a1 +; RV32P-NEXT: sw a0, 64(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ucmplt16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: ucmplt16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ukadd8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 84(sp) +; RV32P-NEXT: lw a1, 80(sp) +; RV32P-NEXT: ukadd8 a0, a0, a1 +; RV32P-NEXT: sw a0, 76(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ukadd16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 60(sp) +; RV32P-NEXT: lw a1, 56(sp) +; RV32P-NEXT: ukadd16 a0, a0, a1 +; RV32P-NEXT: sw a0, 52(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: ukaddh a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 216(sp) +; RV32P-NEXT: lw a1, 212(sp) +; RV32P-NEXT: ukaddw a0, a0, a1 +; RV32P-NEXT: sw a0, 176(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ukcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: ukcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ukcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: ukcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ukstas16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: ukstas16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ukstsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: ukstsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: uksub8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: lw a1, 68(sp) +; RV32P-NEXT: uksub8 a0, a0, a1 +; RV32P-NEXT: sw a0, 64(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: uksub16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: uksub16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 200(sp) +; 
RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: uksubh a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 200(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: uksubw a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 172(sp) +; RV32P-NEXT: lw a1, 168(sp) +; RV32P-NEXT: lw a2, 164(sp) +; RV32P-NEXT: umaqa a0, a1, a2 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 204(sp) +; RV32P-NEXT: lw a1, 72(sp) +; RV32P-NEXT: lw a2, 68(sp) +; RV32P-NEXT: umaqa a0, a1, a2 +; RV32P-NEXT: sw a0, 192(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: umax8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: lw a1, 68(sp) +; RV32P-NEXT: umax8 a0, a0, a1 +; RV32P-NEXT: sw a0, 64(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: umax16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: umax16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: umin8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: lw a1, 68(sp) +; RV32P-NEXT: umin8 a0, a0, a1 +; RV32P-NEXT: sw a0, 64(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: umin16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: umin16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: uradd8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: lw a1, 68(sp) +; RV32P-NEXT: uradd8 a0, a0, a1 +; RV32P-NEXT: sw a0, 64(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: uradd16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: uradd16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 200(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: uraddw a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: urcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: urcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: urcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: urcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: urcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: urcras16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: urcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: urcrsa16 a0, a0, a1 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ursub8 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: lw a1, 68(sp) +; RV32P-NEXT: ursub8 a0, a0, a1 +; RV32P-NEXT: sw a0, 64(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: lw a1, 164(sp) +; RV32P-NEXT: ursub16 a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 48(sp) +; RV32P-NEXT: lw a1, 44(sp) +; RV32P-NEXT: ursub16 a0, a0, a1 +; 
RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 200(sp) +; RV32P-NEXT: lw a1, 196(sp) +; RV32P-NEXT: ursubw a0, a0, a1 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: zunpkd810 a0, a0 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: zunpkd810 a0, a0 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: zunpkd820 a0, a0 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: zunpkd820 a0, a0 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: zunpkd830 a0, a0 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: zunpkd830 a0, a0 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: zunpkd831 a0, a0 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: zunpkd831 a0, a0 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: lw a0, 168(sp) +; RV32P-NEXT: zunpkd832 a0, a0 +; RV32P-NEXT: sw a0, 156(sp) +; RV32P-NEXT: lw a0, 72(sp) +; RV32P-NEXT: zunpkd832 a0, a0 +; RV32P-NEXT: sw a0, 40(sp) +; RV32P-NEXT: addi sp, sp, 224 +; RV32P-NEXT: ret +entry: + %i_t = alloca i32, align 4 + %i_a = alloca i32, align 4 + %i_b = alloca i32, align 4 + %i_r = alloca i32, align 4 + %ui_t = alloca i32, align 4 + %ui_a = alloca i32, align 4 + %ui_b = alloca i32, align 4 + %ui_r = alloca i32, align 4 + %l_t = alloca i32, align 4 + %l_a = alloca i32, align 4 + %l_b = alloca i32, align 4 + %l_r = alloca i32, align 4 + %ul_t = alloca i32, align 4 + %ul_a = alloca i32, align 4 + %ul_b = alloca i32, align 4 + %ul_c = alloca i32, align 4 + %ul_r = alloca i32, align 4 + %ll_t = alloca i64, align 8 + %ll_a = alloca i64, align 8 + %ll_b = alloca i64, align 8 + %ll_r = alloca i64, align 8 + %ull_t = alloca i64, align 8 + %ull_a = alloca i64, align 8 + %ull_b = alloca i64, align 8 + %ull_r = alloca i64, align 8 + %i8x4_a = alloca <4 x i8>, align 4 + %i8x4_b = alloca <4 x i8>, align 4 + %i8x4_r = alloca <4 x i8>, align 4 + %u8x4_a = alloca <4 x i8>, align 4 + %u8x4_b = alloca <4 x i8>, align 4 + %u8x4_r = alloca <4 x i8>, align 4 + %i16x2_a = alloca <2 x i16>, align 4 + %i16x2_b = alloca <2 x i16>, align 4 + %i16x2_r = alloca <2 x i16>, align 4 + %u16x2_a = alloca <2 x i16>, align 4 + %u16x2_b = alloca <2 x i16>, align 4 + %u16x2_r = alloca <2 x i16>, align 4 + %i16x4_r = alloca <4 x i16>, align 8 + %u16x4_r = alloca <4 x i16>, align 8 + %i32x2_r = alloca <2 x i32>, align 8 + %u32x2_r = alloca <2 x i32>, align 8 + store volatile i32 0, i32* %i_t, align 4 + store volatile i32 0, i32* %i_a, align 4 + store volatile i32 1, i32* %i_b, align 4 + store volatile i32 0, i32* %ui_t, align 4 + store volatile i32 1, i32* %ui_a, align 4 + store volatile i32 2, i32* %ui_b, align 4 + store volatile i32 0, i32* %l_t, align 4 + store volatile i32 1, i32* %l_a, align 4 + store volatile i32 2, i32* %l_b, align 4 + store volatile i32 0, i32* %ul_t, align 4 + store volatile i32 1, i32* %ul_a, align 4 + store volatile i32 2, i32* %ul_b, align 4 + store volatile i32 3, i32* %ul_c, align 4 + store volatile i64 0, i64* %ll_t, align 8 + store volatile i64 1, i64* %ll_a, align 8 + store volatile i64 2, i64* %ll_b, align 8 + store volatile i64 0, i64* %ull_t, align 8 + store volatile i64 1, i64* %ull_a, align 8 + store volatile i64 2, i64* %ull_b, align 8 + store volatile <4 x i8> , <4 x i8>* %i8x4_a, align 4 + store volatile <4 x i8> , <4 x i8>* %i8x4_b, align 4 + store volatile <4 x i8> , <4 x i8>* %u8x4_a, align 4 + store volatile <4 x i8> , <4 x i8>* %u8x4_b, align 4 + store volatile <2 x 
i16> , <2 x i16>* %i16x2_a, align 4 + store volatile <2 x i16> , <2 x i16>* %i16x2_b, align 4 + store volatile <2 x i16> , <2 x i16>* %u16x2_a, align 4 + store volatile <2 x i16> , <2 x i16>* %u16x2_b, align 4 + %0 = load i32, i32* %ul_a, align 4 + %1 = load i32, i32* %ul_b, align 4 + %2 = call i32 @llvm.riscv.add8.i32(i32 %0, i32 %1) + store volatile i32 %2, i32* %ul_r, align 4 + %3 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %4 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %5 = call <4 x i8> @llvm.riscv.v.add8.v4i8(<4 x i8> %3, <4 x i8> %4) + store volatile <4 x i8> %5, <4 x i8>* %u8x4_r, align 4 + %6 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %7 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %8 = call <4 x i8> @llvm.riscv.v.add8.v4i8(<4 x i8> %6, <4 x i8> %7) + store volatile <4 x i8> %8, <4 x i8>* %i8x4_r, align 4 + %9 = load i32, i32* %ul_a, align 4 + %10 = load i32, i32* %ul_b, align 4 + %11 = call i32 @llvm.riscv.add16.i32(i32 %9, i32 %10) + store volatile i32 %11, i32* %ul_r, align 4 + %12 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %13 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %14 = call <2 x i16> @llvm.riscv.v.add16.v2i16(<2 x i16> %12, <2 x i16> %13) + store volatile <2 x i16> %14, <2 x i16>* %u16x2_r, align 4 + %15 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %16 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %17 = call <2 x i16> @llvm.riscv.v.add16.v2i16(<2 x i16> %15, <2 x i16> %16) + store volatile <2 x i16> %17, <2 x i16>* %i16x2_r, align 4 + %18 = load i32, i32* %l_a, align 4 + %19 = load i32, i32* %l_b, align 4 + %20 = call i32 @llvm.riscv.ave.i32(i32 %18, i32 %19) + store volatile i32 %20, i32* %l_r, align 4 + %21 = load i32, i32* %ul_a, align 4 + %22 = load i32, i32* %ul_b, align 4 + %23 = call i32 @llvm.riscv.bitrev.i32(i32 %21, i32 %22) + store volatile i32 %23, i32* %ul_r, align 4 + %24 = load i32, i32* %ul_a, align 4 + %25 = load i32, i32* %ul_b, align 4 + %26 = load i32, i32* %ul_c, align 4 + %27 = call i32 @llvm.riscv.bpick.i32(i32 %24, i32 %25, i32 %26) + store volatile i32 %27, i32* %ul_r, align 4 + %28 = load i32, i32* %ul_a, align 4 + %29 = call i32 @llvm.riscv.clrs8.i32(i32 %28) + store volatile i32 %29, i32* %ul_r, align 4 + %30 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %31 = call <4 x i8> @llvm.riscv.v.clrs8.v4i8(<4 x i8> %30) + store volatile <4 x i8> %31, <4 x i8>* %u8x4_r, align 4 + %32 = load i32, i32* %ul_a, align 4 + %33 = call i32 @llvm.riscv.clrs16.i32(i32 %32) + store volatile i32 %33, i32* %ul_r, align 4 + %34 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %35 = call <2 x i16> @llvm.riscv.v.clrs16.v2i16(<2 x i16> %34) + store volatile <2 x i16> %35, <2 x i16>* %u16x2_r, align 4 + %36 = load i32, i32* %ul_a, align 4 + %37 = call i32 @llvm.riscv.clrs32.i32(i32 %36) + store volatile i32 %37, i32* %ul_r, align 4 + %38 = load i32, i32* %ul_a, align 4 + %39 = call i32 @llvm.riscv.clo8.i32(i32 %38) + store volatile i32 %39, i32* %ul_r, align 4 + %40 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %41 = call <4 x i8> @llvm.riscv.v.clo8.v4i8(<4 x i8> %40) + store volatile <4 x i8> %41, <4 x i8>* %u8x4_r, align 4 + %42 = load i32, i32* %ul_a, align 4 + %43 = call i32 @llvm.riscv.clo16.i32(i32 %42) + store volatile i32 %43, i32* %ul_r, align 4 + %44 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %45 = call <2 x i16> @llvm.riscv.v.clo16.v2i16(<2 x i16> %44) + store volatile <2 x i16> %45, <2 x i16>* %u16x2_r, align 4 + %46 = load i32, i32* %ul_a, align 4 + %47 = call i32 @llvm.riscv.clo32.i32(i32 %46) + store volatile i32 %47, i32* 
%ul_r, align 4 + %48 = load i32, i32* %ul_a, align 4 + %49 = call i32 @llvm.riscv.clz8.i32(i32 %48) + store volatile i32 %49, i32* %ul_r, align 4 + %50 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %51 = call <4 x i8> @llvm.riscv.v.clz8.v4i8(<4 x i8> %50) + store volatile <4 x i8> %51, <4 x i8>* %u8x4_r, align 4 + %52 = load i32, i32* %ul_a, align 4 + %53 = call i32 @llvm.riscv.clz16.i32(i32 %52) + store volatile i32 %53, i32* %ul_r, align 4 + %54 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %55 = call <2 x i16> @llvm.riscv.v.clz16.v2i16(<2 x i16> %54) + store volatile <2 x i16> %55, <2 x i16>* %u16x2_r, align 4 + %56 = load i32, i32* %ul_a, align 4 + %57 = call i32 @llvm.riscv.clz32.i32(i32 %56) + store volatile i32 %57, i32* %ul_r, align 4 + %58 = load i32, i32* %ul_a, align 4 + %59 = load i32, i32* %ul_b, align 4 + %60 = call i32 @llvm.riscv.cmpeq8.i32(i32 %58, i32 %59) + store volatile i32 %60, i32* %ul_r, align 4 + %61 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %62 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %63 = call <4 x i8> @llvm.riscv.v.cmpeq8.v4i8(<4 x i8> %61, <4 x i8> %62) + store volatile <4 x i8> %63, <4 x i8>* %u8x4_r, align 4 + %64 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %65 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %66 = call <4 x i8> @llvm.riscv.v.cmpeq8.v4i8(<4 x i8> %64, <4 x i8> %65) + store volatile <4 x i8> %66, <4 x i8>* %u8x4_r, align 4 + %67 = load i32, i32* %ul_a, align 4 + %68 = load i32, i32* %ul_b, align 4 + %69 = call i32 @llvm.riscv.cmpeq16.i32(i32 %67, i32 %68) + store volatile i32 %69, i32* %ul_r, align 4 + %70 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %71 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %72 = call <2 x i16> @llvm.riscv.v.cmpeq16.v2i16(<2 x i16> %70, <2 x i16> %71) + store volatile <2 x i16> %72, <2 x i16>* %u16x2_r, align 4 + %73 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %74 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %75 = call <2 x i16> @llvm.riscv.v.cmpeq16.v2i16(<2 x i16> %73, <2 x i16> %74) + store volatile <2 x i16> %75, <2 x i16>* %u16x2_r, align 4 + %76 = load i32, i32* %ul_a, align 4 + %77 = load i32, i32* %ul_b, align 4 + %78 = call i32 @llvm.riscv.cras16.i32(i32 %76, i32 %77) + store volatile i32 %78, i32* %ul_r, align 4 + %79 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %80 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %81 = call <2 x i16> @llvm.riscv.v.cras16.v2i16(<2 x i16> %79, <2 x i16> %80) + store volatile <2 x i16> %81, <2 x i16>* %u16x2_r, align 4 + %82 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %83 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %84 = call <2 x i16> @llvm.riscv.v.cras16.v2i16(<2 x i16> %82, <2 x i16> %83) + store volatile <2 x i16> %84, <2 x i16>* %i16x2_r, align 4 + %85 = load i32, i32* %ul_a, align 4 + %86 = load i32, i32* %ul_b, align 4 + %87 = call i32 @llvm.riscv.crsa16.i32(i32 %85, i32 %86) + store volatile i32 %87, i32* %ul_r, align 4 + %88 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %89 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %90 = call <2 x i16> @llvm.riscv.v.crsa16.v2i16(<2 x i16> %88, <2 x i16> %89) + store volatile <2 x i16> %90, <2 x i16>* %u16x2_r, align 4 + %91 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %92 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %93 = call <2 x i16> @llvm.riscv.v.crsa16.v2i16(<2 x i16> %91, <2 x i16> %92) + store volatile <2 x i16> %93, <2 x i16>* %i16x2_r, align 4 + %94 = load i32, i32* %ul_a, align 4 + %95 = load i32, i32* %ul_b, align 4 + %96 = call i32 @llvm.riscv.insb.i32(i32 %94, 
i32 %95, i32 3) + store volatile i32 %96, i32* %ul_r, align 4 + %97 = load i32, i32* %ul_a, align 4 + %98 = call i32 @llvm.riscv.kabs8.i32(i32 %97) + store volatile i32 %98, i32* %ul_r, align 4 + %99 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %100 = call <4 x i8> @llvm.riscv.v.kabs8.v4i8(<4 x i8> %99) + store volatile <4 x i8> %100, <4 x i8>* %i8x4_r, align 4 + %101 = load i32, i32* %ul_a, align 4 + %102 = call i32 @llvm.riscv.kabs16.i32(i32 %101) + store volatile i32 %102, i32* %ul_r, align 4 + %103 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %104 = call <2 x i16> @llvm.riscv.v.kabs16.v2i16(<2 x i16> %103) + store volatile <2 x i16> %104, <2 x i16>* %i16x2_r, align 4 + %105 = load i32, i32* %l_a, align 4 + %106 = call i32 @llvm.riscv.kabsw.i32(i32 %105) + store volatile i32 %106, i32* %l_r, align 4 + %107 = load i32, i32* %ul_a, align 4 + %108 = load i32, i32* %ul_b, align 4 + %109 = call i32 @llvm.riscv.kadd8.i32(i32 %107, i32 %108) + store volatile i32 %109, i32* %ul_r, align 4 + %110 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %111 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %112 = call <4 x i8> @llvm.riscv.v.kadd8.v4i8(<4 x i8> %110, <4 x i8> %111) + store volatile <4 x i8> %112, <4 x i8>* %i8x4_r, align 4 + %113 = load i32, i32* %ul_a, align 4 + %114 = load i32, i32* %ul_b, align 4 + %115 = call i32 @llvm.riscv.kadd16.i32(i32 %113, i32 %114) + store volatile i32 %115, i32* %ul_r, align 4 + %116 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %117 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %118 = call <2 x i16> @llvm.riscv.v.kadd16.v2i16(<2 x i16> %116, <2 x i16> %117) + store volatile <2 x i16> %118, <2 x i16>* %i16x2_r, align 4 + %119 = load i32, i32* %i_a, align 4 + %120 = load i32, i32* %i_b, align 4 + %121 = call i32 @llvm.riscv.kaddh.i32(i32 %119, i32 %120) + store volatile i32 %121, i32* %l_r, align 4 + %122 = load i32, i32* %i_a, align 4 + %123 = load i32, i32* %i_b, align 4 + %124 = call i32 @llvm.riscv.kaddw.i32(i32 %122, i32 %123) + store volatile i32 %124, i32* %l_r, align 4 + %125 = load i32, i32* %ul_a, align 4 + %126 = load i32, i32* %ul_b, align 4 + %127 = call i32 @llvm.riscv.kcras16.i32(i32 %125, i32 %126) + store volatile i32 %127, i32* %ul_r, align 4 + %128 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %129 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %130 = call <2 x i16> @llvm.riscv.v.kcras16.v2i16(<2 x i16> %128, <2 x i16> %129) + store volatile <2 x i16> %130, <2 x i16>* %i16x2_r, align 4 + %131 = load i32, i32* %ul_a, align 4 + %132 = load i32, i32* %ul_b, align 4 + %133 = call i32 @llvm.riscv.kcrsa16.i32(i32 %131, i32 %132) + store volatile i32 %133, i32* %ul_r, align 4 + %134 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %135 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %136 = call <2 x i16> @llvm.riscv.v.kcrsa16.v2i16(<2 x i16> %134, <2 x i16> %135) + store volatile <2 x i16> %136, <2 x i16>* %i16x2_r, align 4 + %137 = load i32, i32* %ui_a, align 4 + %138 = load i32, i32* %ui_b, align 4 + %139 = call i32 @llvm.riscv.kdmbb.i32(i32 %137, i32 %138) + store volatile i32 %139, i32* %l_r, align 4 + %140 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %141 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %142 = call i32 @llvm.riscv.v.kdmbb.i32.v2i16(<2 x i16> %140, <2 x i16> %141) + store volatile i32 %142, i32* %l_r, align 4 + %143 = load i32, i32* %ui_a, align 4 + %144 = load i32, i32* %ui_b, align 4 + %145 = call i32 @llvm.riscv.kdmbt.i32(i32 %143, i32 %144) + store volatile i32 %145, i32* %l_r, align 4 + %146 = load <2 x i16>, 
<2 x i16>* %i16x2_a, align 4 + %147 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %148 = call i32 @llvm.riscv.v.kdmbt.i32.v2i16(<2 x i16> %146, <2 x i16> %147) + store volatile i32 %148, i32* %l_r, align 4 + %149 = load i32, i32* %ui_a, align 4 + %150 = load i32, i32* %ui_b, align 4 + %151 = call i32 @llvm.riscv.kdmtt.i32(i32 %149, i32 %150) + store volatile i32 %151, i32* %l_r, align 4 + %152 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %153 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %154 = call i32 @llvm.riscv.v.kdmtt.i32.v2i16(<2 x i16> %152, <2 x i16> %153) + store volatile i32 %154, i32* %l_r, align 4 + %155 = load i32, i32* %l_t, align 4 + %156 = load i32, i32* %ui_a, align 4 + %157 = load i32, i32* %ui_b, align 4 + %158 = call i32 @llvm.riscv.kdmabb.i32(i32 %155, i32 %156, i32 %157) + store volatile i32 %158, i32* %l_r, align 4 + %159 = load i32, i32* %l_t, align 4 + %160 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %161 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %162 = call i32 @llvm.riscv.v.kdmabb.i32.v2i16(i32 %159, <2 x i16> %160, <2 x i16> %161) + store volatile i32 %162, i32* %l_r, align 4 + %163 = load i32, i32* %l_t, align 4 + %164 = load i32, i32* %ui_a, align 4 + %165 = load i32, i32* %ui_b, align 4 + %166 = call i32 @llvm.riscv.kdmabt.i32(i32 %163, i32 %164, i32 %165) + store volatile i32 %166, i32* %l_r, align 4 + %167 = load i32, i32* %l_t, align 4 + %168 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %169 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %170 = call i32 @llvm.riscv.v.kdmabt.i32.v2i16(i32 %167, <2 x i16> %168, <2 x i16> %169) + store volatile i32 %170, i32* %l_r, align 4 + %171 = load i32, i32* %l_t, align 4 + %172 = load i32, i32* %ui_a, align 4 + %173 = load i32, i32* %ui_b, align 4 + %174 = call i32 @llvm.riscv.kdmatt.i32(i32 %171, i32 %172, i32 %173) + store volatile i32 %174, i32* %l_r, align 4 + %175 = load i32, i32* %l_t, align 4 + %176 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %177 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %178 = call i32 @llvm.riscv.v.kdmatt.i32.v2i16(i32 %175, <2 x i16> %176, <2 x i16> %177) + store volatile i32 %178, i32* %l_r, align 4 + %179 = load i32, i32* %ul_a, align 4 + %180 = load i32, i32* %ul_b, align 4 + %181 = call i32 @llvm.riscv.khm8.i32(i32 %179, i32 %180) + store volatile i32 %181, i32* %ul_r, align 4 + %182 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %183 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %184 = call <4 x i8> @llvm.riscv.v.khm8.v4i8(<4 x i8> %182, <4 x i8> %183) + store volatile <4 x i8> %184, <4 x i8>* %i8x4_r, align 4 + %185 = load i32, i32* %ul_a, align 4 + %186 = load i32, i32* %ul_b, align 4 + %187 = call i32 @llvm.riscv.khmx8.i32(i32 %185, i32 %186) + store volatile i32 %187, i32* %ul_r, align 4 + %188 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %189 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %190 = call <4 x i8> @llvm.riscv.v.khmx8.v4i8(<4 x i8> %188, <4 x i8> %189) + store volatile <4 x i8> %190, <4 x i8>* %i8x4_r, align 4 + %191 = load i32, i32* %ul_a, align 4 + %192 = load i32, i32* %ul_b, align 4 + %193 = call i32 @llvm.riscv.khm16.i32(i32 %191, i32 %192) + store volatile i32 %193, i32* %ul_r, align 4 + %194 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %195 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %196 = call <2 x i16> @llvm.riscv.v.khm16.v2i16(<2 x i16> %194, <2 x i16> %195) + store volatile <2 x i16> %196, <2 x i16>* %i16x2_r, align 4 + %197 = load i32, i32* %ul_a, align 4 + %198 = load i32, i32* %ul_b, align 4 + %199 = call i32 
@llvm.riscv.khmx16.i32(i32 %197, i32 %198) + store volatile i32 %199, i32* %ul_r, align 4 + %200 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %201 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %202 = call <2 x i16> @llvm.riscv.v.khmx16.v2i16(<2 x i16> %200, <2 x i16> %201) + store volatile <2 x i16> %202, <2 x i16>* %i16x2_r, align 4 + %203 = load i32, i32* %ui_a, align 4 + %204 = load i32, i32* %ui_b, align 4 + %205 = call i32 @llvm.riscv.khmbb.i32(i32 %203, i32 %204) + store volatile i32 %205, i32* %l_r, align 4 + %206 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %207 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %208 = call i32 @llvm.riscv.v.khmbb.i32.v2i16(<2 x i16> %206, <2 x i16> %207) + store volatile i32 %208, i32* %l_r, align 4 + %209 = load i32, i32* %ui_a, align 4 + %210 = load i32, i32* %ui_b, align 4 + %211 = call i32 @llvm.riscv.khmbt.i32(i32 %209, i32 %210) + store volatile i32 %211, i32* %l_r, align 4 + %212 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %213 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %214 = call i32 @llvm.riscv.v.khmbt.i32.v2i16(<2 x i16> %212, <2 x i16> %213) + store volatile i32 %214, i32* %l_r, align 4 + %215 = load i32, i32* %ui_a, align 4 + %216 = load i32, i32* %ui_b, align 4 + %217 = call i32 @llvm.riscv.khmtt.i32(i32 %215, i32 %216) + store volatile i32 %217, i32* %l_r, align 4 + %218 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %219 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %220 = call i32 @llvm.riscv.v.khmtt.i32.v2i16(<2 x i16> %218, <2 x i16> %219) + store volatile i32 %220, i32* %l_r, align 4 + %221 = load i32, i32* %l_t, align 4 + %222 = load i32, i32* %ul_a, align 4 + %223 = load i32, i32* %ul_b, align 4 + %224 = call i32 @llvm.riscv.kmabb.i32(i32 %221, i32 %222, i32 %223) + store volatile i32 %224, i32* %l_r, align 4 + %225 = load i32, i32* %l_t, align 4 + %226 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %227 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %228 = call i32 @llvm.riscv.v.kmabb.i32.v2i16(i32 %225, <2 x i16> %226, <2 x i16> %227) + store volatile i32 %228, i32* %l_r, align 4 + %229 = load i32, i32* %l_t, align 4 + %230 = load i32, i32* %ul_a, align 4 + %231 = load i32, i32* %ul_b, align 4 + %232 = call i32 @llvm.riscv.kmabt.i32(i32 %229, i32 %230, i32 %231) + store volatile i32 %232, i32* %l_r, align 4 + %233 = load i32, i32* %l_t, align 4 + %234 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %235 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %236 = call i32 @llvm.riscv.v.kmabt.i32.v2i16(i32 %233, <2 x i16> %234, <2 x i16> %235) + store volatile i32 %236, i32* %l_r, align 4 + %237 = load i32, i32* %l_t, align 4 + %238 = load i32, i32* %ul_a, align 4 + %239 = load i32, i32* %ul_b, align 4 + %240 = call i32 @llvm.riscv.kmatt.i32(i32 %237, i32 %238, i32 %239) + store volatile i32 %240, i32* %l_r, align 4 + %241 = load i32, i32* %l_t, align 4 + %242 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %243 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %244 = call i32 @llvm.riscv.v.kmatt.i32.v2i16(i32 %241, <2 x i16> %242, <2 x i16> %243) + store volatile i32 %244, i32* %l_r, align 4 + %245 = load i32, i32* %l_t, align 4 + %246 = load i32, i32* %ul_a, align 4 + %247 = load i32, i32* %ul_b, align 4 + %248 = call i32 @llvm.riscv.kmada.i32(i32 %245, i32 %246, i32 %247) + store volatile i32 %248, i32* %l_r, align 4 + %249 = load i32, i32* %l_t, align 4 + %250 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %251 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %252 = call i32 
@llvm.riscv.v.kmada.i32.v2i16(i32 %249, <2 x i16> %250, <2 x i16> %251) + store volatile i32 %252, i32* %l_r, align 4 + %253 = load i32, i32* %l_t, align 4 + %254 = load i32, i32* %ul_a, align 4 + %255 = load i32, i32* %ul_b, align 4 + %256 = call i32 @llvm.riscv.kmaxda.i32(i32 %253, i32 %254, i32 %255) + store volatile i32 %256, i32* %l_r, align 4 + %257 = load i32, i32* %l_t, align 4 + %258 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %259 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %260 = call i32 @llvm.riscv.v.kmaxda.i32.v2i16(i32 %257, <2 x i16> %258, <2 x i16> %259) + store volatile i32 %260, i32* %l_r, align 4 + %261 = load i32, i32* %l_t, align 4 + %262 = load i32, i32* %ul_a, align 4 + %263 = load i32, i32* %ul_b, align 4 + %264 = call i32 @llvm.riscv.kmads.i32(i32 %261, i32 %262, i32 %263) + store volatile i32 %264, i32* %l_r, align 4 + %265 = load i32, i32* %l_t, align 4 + %266 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %267 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %268 = call i32 @llvm.riscv.v.kmads.i32.v2i16(i32 %265, <2 x i16> %266, <2 x i16> %267) + store volatile i32 %268, i32* %l_r, align 4 + %269 = load i32, i32* %l_t, align 4 + %270 = load i32, i32* %ul_a, align 4 + %271 = load i32, i32* %ul_b, align 4 + %272 = call i32 @llvm.riscv.kmadrs.i32(i32 %269, i32 %270, i32 %271) + store volatile i32 %272, i32* %l_r, align 4 + %273 = load i32, i32* %l_t, align 4 + %274 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %275 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %276 = call i32 @llvm.riscv.v.kmadrs.i32.v2i16(i32 %273, <2 x i16> %274, <2 x i16> %275) + store volatile i32 %276, i32* %l_r, align 4 + %277 = load i32, i32* %l_t, align 4 + %278 = load i32, i32* %ul_a, align 4 + %279 = load i32, i32* %ul_b, align 4 + %280 = call i32 @llvm.riscv.kmaxds.i32(i32 %277, i32 %278, i32 %279) + store volatile i32 %280, i32* %l_r, align 4 + %281 = load i32, i32* %l_t, align 4 + %282 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %283 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %284 = call i32 @llvm.riscv.v.kmaxds.i32.v2i16(i32 %281, <2 x i16> %282, <2 x i16> %283) + store volatile i32 %284, i32* %l_r, align 4 + %285 = load i32, i32* %ul_a, align 4 + %286 = load i32, i32* %ul_b, align 4 + %287 = call i32 @llvm.riscv.kmda.i32(i32 %285, i32 %286) + store volatile i32 %287, i32* %l_r, align 4 + %288 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %289 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %290 = call i32 @llvm.riscv.v.kmda.i32.v2i16(<2 x i16> %288, <2 x i16> %289) + store volatile i32 %290, i32* %l_r, align 4 + %291 = load i32, i32* %ul_a, align 4 + %292 = load i32, i32* %ul_b, align 4 + %293 = call i32 @llvm.riscv.kmxda.i32(i32 %291, i32 %292) + store volatile i32 %293, i32* %l_r, align 4 + %294 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %295 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %296 = call i32 @llvm.riscv.v.kmxda.i32.v2i16(<2 x i16> %294, <2 x i16> %295) + store volatile i32 %296, i32* %l_r, align 4 + %297 = load i32, i32* %l_t, align 4 + %298 = load i32, i32* %l_a, align 4 + %299 = load i32, i32* %l_b, align 4 + %300 = call i32 @llvm.riscv.kmmac.i32(i32 %297, i32 %298, i32 %299) + store volatile i32 %300, i32* %l_r, align 4 + %301 = load i32, i32* %l_t, align 4 + %302 = load i32, i32* %l_a, align 4 + %303 = load i32, i32* %l_b, align 4 + %304 = call i32 @llvm.riscv.kmmac.u.i32(i32 %301, i32 %302, i32 %303) + store volatile i32 %304, i32* %l_r, align 4 + %305 = load i32, i32* %l_t, align 4 + %306 = load i32, i32* %ul_a, align 4 + 
%307 = load i32, i32* %ul_b, align 4 + %308 = call i32 @llvm.riscv.kmmawb.i32(i32 %305, i32 %306, i32 %307) + store volatile i32 %308, i32* %l_r, align 4 + %309 = load i32, i32* %i_t, align 4 + %310 = load i32, i32* %i_a, align 4 + %311 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %312 = call i32 @llvm.riscv.v.kmmawb.i32.v2i16(i32 %309, i32 %310, <2 x i16> %311) + store volatile i32 %312, i32* %i_r, align 4 + %313 = load i32, i32* %l_t, align 4 + %314 = load i32, i32* %ul_a, align 4 + %315 = load i32, i32* %ul_b, align 4 + %316 = call i32 @llvm.riscv.kmmawb.u.i32(i32 %313, i32 %314, i32 %315) + store volatile i32 %316, i32* %l_r, align 4 + %317 = load i32, i32* %i_t, align 4 + %318 = load i32, i32* %i_a, align 4 + %319 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %320 = call i32 @llvm.riscv.v.kmmawb.u.i32.v2i16(i32 %317, i32 %318, <2 x i16> %319) + store volatile i32 %320, i32* %i_r, align 4 + %321 = load i32, i32* %l_t, align 4 + %322 = load i32, i32* %ul_a, align 4 + %323 = load i32, i32* %ul_b, align 4 + %324 = call i32 @llvm.riscv.kmmawb2.i32(i32 %321, i32 %322, i32 %323) + store volatile i32 %324, i32* %l_r, align 4 + %325 = load i32, i32* %i_t, align 4 + %326 = load i32, i32* %i_a, align 4 + %327 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %328 = call i32 @llvm.riscv.v.kmmawb2.i32.v2i16(i32 %325, i32 %326, <2 x i16> %327) + store volatile i32 %328, i32* %i_r, align 4 + %329 = load i32, i32* %l_t, align 4 + %330 = load i32, i32* %ul_a, align 4 + %331 = load i32, i32* %ul_b, align 4 + %332 = call i32 @llvm.riscv.kmmawb2.u.i32(i32 %329, i32 %330, i32 %331) + store volatile i32 %332, i32* %l_r, align 4 + %333 = load i32, i32* %i_t, align 4 + %334 = load i32, i32* %i_a, align 4 + %335 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %336 = call i32 @llvm.riscv.v.kmmawb2.u.i32.v2i16(i32 %333, i32 %334, <2 x i16> %335) + store volatile i32 %336, i32* %i_r, align 4 + %337 = load i32, i32* %l_t, align 4 + %338 = load i32, i32* %ul_a, align 4 + %339 = load i32, i32* %ul_b, align 4 + %340 = call i32 @llvm.riscv.kmmawt.i32(i32 %337, i32 %338, i32 %339) + store volatile i32 %340, i32* %l_r, align 4 + %341 = load i32, i32* %i_t, align 4 + %342 = load i32, i32* %i_a, align 4 + %343 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %344 = call i32 @llvm.riscv.v.kmmawt.i32.v2i16(i32 %341, i32 %342, <2 x i16> %343) + store volatile i32 %344, i32* %i_r, align 4 + %345 = load i32, i32* %l_t, align 4 + %346 = load i32, i32* %ul_a, align 4 + %347 = load i32, i32* %ul_b, align 4 + %348 = call i32 @llvm.riscv.kmmawt.u.i32(i32 %345, i32 %346, i32 %347) + store volatile i32 %348, i32* %l_r, align 4 + %349 = load i32, i32* %i_t, align 4 + %350 = load i32, i32* %i_a, align 4 + %351 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %352 = call i32 @llvm.riscv.v.kmmawt.u.i32.v2i16(i32 %349, i32 %350, <2 x i16> %351) + store volatile i32 %352, i32* %i_r, align 4 + %353 = load i32, i32* %l_t, align 4 + %354 = load i32, i32* %ul_a, align 4 + %355 = load i32, i32* %ul_b, align 4 + %356 = call i32 @llvm.riscv.kmmawt2.i32(i32 %353, i32 %354, i32 %355) + store volatile i32 %356, i32* %l_r, align 4 + %357 = load i32, i32* %i_t, align 4 + %358 = load i32, i32* %i_a, align 4 + %359 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %360 = call i32 @llvm.riscv.v.kmmawt2.i32.v2i16(i32 %357, i32 %358, <2 x i16> %359) + store volatile i32 %360, i32* %i_r, align 4 + %361 = load i32, i32* %l_t, align 4 + %362 = load i32, i32* %ul_a, align 4 + %363 = load i32, i32* %ul_b, align 4 + %364 = call i32 
@llvm.riscv.kmmawt2.u.i32(i32 %361, i32 %362, i32 %363) + store volatile i32 %364, i32* %l_r, align 4 + %365 = load i32, i32* %i_t, align 4 + %366 = load i32, i32* %i_a, align 4 + %367 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %368 = call i32 @llvm.riscv.v.kmmawt2.u.i32.v2i16(i32 %365, i32 %366, <2 x i16> %367) + store volatile i32 %368, i32* %i_r, align 4 + %369 = load i32, i32* %l_t, align 4 + %370 = load i32, i32* %l_a, align 4 + %371 = load i32, i32* %l_b, align 4 + %372 = call i32 @llvm.riscv.kmmsb.i32(i32 %369, i32 %370, i32 %371) + store volatile i32 %372, i32* %l_r, align 4 + %373 = load i32, i32* %l_t, align 4 + %374 = load i32, i32* %l_a, align 4 + %375 = load i32, i32* %l_b, align 4 + %376 = call i32 @llvm.riscv.kmmsb.u.i32(i32 %373, i32 %374, i32 %375) + store volatile i32 %376, i32* %l_r, align 4 + %377 = load i32, i32* %l_a, align 4 + %378 = load i32, i32* %ul_b, align 4 + %379 = call i32 @llvm.riscv.kmmwb2.i32(i32 %377, i32 %378) + store volatile i32 %379, i32* %l_r, align 4 + %380 = load i32, i32* %i_a, align 4 + %381 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %382 = call i32 @llvm.riscv.v.kmmwb2.i32.v2i16(i32 %380, <2 x i16> %381) + store volatile i32 %382, i32* %i_r, align 4 + %383 = load i32, i32* %l_a, align 4 + %384 = load i32, i32* %ul_b, align 4 + %385 = call i32 @llvm.riscv.kmmwb2.u.i32(i32 %383, i32 %384) + store volatile i32 %385, i32* %l_r, align 4 + %386 = load i32, i32* %i_a, align 4 + %387 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %388 = call i32 @llvm.riscv.v.kmmwb2.u.i32.v2i16(i32 %386, <2 x i16> %387) + store volatile i32 %388, i32* %i_r, align 4 + %389 = load i32, i32* %l_a, align 4 + %390 = load i32, i32* %ul_b, align 4 + %391 = call i32 @llvm.riscv.kmmwt2.i32(i32 %389, i32 %390) + store volatile i32 %391, i32* %l_r, align 4 + %392 = load i32, i32* %i_a, align 4 + %393 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %394 = call i32 @llvm.riscv.v.kmmwt2.i32.v2i16(i32 %392, <2 x i16> %393) + store volatile i32 %394, i32* %i_r, align 4 + %395 = load i32, i32* %l_a, align 4 + %396 = load i32, i32* %ul_b, align 4 + %397 = call i32 @llvm.riscv.kmmwt2.u.i32(i32 %395, i32 %396) + store volatile i32 %397, i32* %l_r, align 4 + %398 = load i32, i32* %i_a, align 4 + %399 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %400 = call i32 @llvm.riscv.v.kmmwt2.u.i32.v2i16(i32 %398, <2 x i16> %399) + store volatile i32 %400, i32* %i_r, align 4 + %401 = load i32, i32* %l_t, align 4 + %402 = load i32, i32* %ul_a, align 4 + %403 = load i32, i32* %ul_b, align 4 + %404 = call i32 @llvm.riscv.kmsda.i32(i32 %401, i32 %402, i32 %403) + store volatile i32 %404, i32* %l_r, align 4 + %405 = load i32, i32* %i_t, align 4 + %406 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %407 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %408 = call i32 @llvm.riscv.v.kmsda.i32.v2i16(i32 %405, <2 x i16> %406, <2 x i16> %407) + store volatile i32 %408, i32* %i_r, align 4 + %409 = load i32, i32* %l_t, align 4 + %410 = load i32, i32* %ul_a, align 4 + %411 = load i32, i32* %ul_b, align 4 + %412 = call i32 @llvm.riscv.kmsxda.i32(i32 %409, i32 %410, i32 %411) + store volatile i32 %412, i32* %l_r, align 4 + %413 = load i32, i32* %i_t, align 4 + %414 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %415 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %416 = call i32 @llvm.riscv.v.kmsxda.i32.v2i16(i32 %413, <2 x i16> %414, <2 x i16> %415) + store volatile i32 %416, i32* %i_r, align 4 + %417 = load i32, i32* %l_a, align 4 + %418 = load i32, i32* %i_b, align 4 + %419 = call i32 
@llvm.riscv.ksllw.i32(i32 %417, i32 %418) + store volatile i32 %419, i32* %l_r, align 4 + %420 = load i32, i32* %ul_a, align 4 + %421 = load i32, i32* %ui_b, align 4 + %422 = call i32 @llvm.riscv.ksll8.i32(i32 %420, i32 %421) + store volatile i32 %422, i32* %ul_r, align 4 + %423 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %424 = load i32, i32* %ui_b, align 4 + %425 = call <4 x i8> @llvm.riscv.v.ksll8.v4i8.i32(<4 x i8> %423, i32 %424) + store volatile <4 x i8> %425, <4 x i8>* %i8x4_r, align 4 + %426 = load i32, i32* %ul_a, align 4 + %427 = load i32, i32* %ui_b, align 4 + %428 = call i32 @llvm.riscv.ksll16.i32(i32 %426, i32 %427) + store volatile i32 %428, i32* %ul_r, align 4 + %429 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %430 = load i32, i32* %ui_b, align 4 + %431 = call <2 x i16> @llvm.riscv.v.ksll16.v2i16.i32(<2 x i16> %429, i32 %430) + store volatile <2 x i16> %431, <2 x i16>* %i16x2_r, align 4 + %432 = load i32, i32* %ul_a, align 4 + %433 = load i32, i32* %i_b, align 4 + %434 = call i32 @llvm.riscv.kslra8.i32(i32 %432, i32 %433) + store volatile i32 %434, i32* %ul_r, align 4 + %435 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %436 = load i32, i32* %i_b, align 4 + %437 = call <4 x i8> @llvm.riscv.v.kslra8.v4i8.i32(<4 x i8> %435, i32 %436) + store volatile <4 x i8> %437, <4 x i8>* %i8x4_r, align 4 + %438 = load i32, i32* %ul_a, align 4 + %439 = load i32, i32* %i_b, align 4 + %440 = call i32 @llvm.riscv.kslra8.u.i32(i32 %438, i32 %439) + store volatile i32 %440, i32* %ul_r, align 4 + %441 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %442 = load i32, i32* %i_b, align 4 + %443 = call <4 x i8> @llvm.riscv.v.kslra8.u.v4i8.i32(<4 x i8> %441, i32 %442) + store volatile <4 x i8> %443, <4 x i8>* %i8x4_r, align 4 + %444 = load i32, i32* %ul_a, align 4 + %445 = load i32, i32* %i_b, align 4 + %446 = call i32 @llvm.riscv.kslra16.i32(i32 %444, i32 %445) + store volatile i32 %446, i32* %ul_r, align 4 + %447 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %448 = load i32, i32* %i_b, align 4 + %449 = call <2 x i16> @llvm.riscv.v.kslra16.v2i16.i32(<2 x i16> %447, i32 %448) + store volatile <2 x i16> %449, <2 x i16>* %i16x2_r, align 4 + %450 = load i32, i32* %ul_a, align 4 + %451 = load i32, i32* %i_b, align 4 + %452 = call i32 @llvm.riscv.kslra16.u.i32(i32 %450, i32 %451) + store volatile i32 %452, i32* %ul_r, align 4 + %453 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %454 = load i32, i32* %i_b, align 4 + %455 = call <2 x i16> @llvm.riscv.v.kslra16.u.v2i16.i32(<2 x i16> %453, i32 %454) + store volatile <2 x i16> %455, <2 x i16>* %i16x2_r, align 4 + %456 = load i32, i32* %ul_a, align 4 + %457 = load i32, i32* %ul_b, align 4 + %458 = call i32 @llvm.riscv.kstas16.i32(i32 %456, i32 %457) + store volatile i32 %458, i32* %ul_r, align 4 + %459 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %460 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %461 = call <2 x i16> @llvm.riscv.v.kstas16.v2i16(<2 x i16> %459, <2 x i16> %460) + store volatile <2 x i16> %461, <2 x i16>* %i16x2_r, align 4 + %462 = load i32, i32* %ul_a, align 4 + %463 = load i32, i32* %ul_b, align 4 + %464 = call i32 @llvm.riscv.kstsa16.i32(i32 %462, i32 %463) + store volatile i32 %464, i32* %ul_r, align 4 + %465 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %466 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %467 = call <2 x i16> @llvm.riscv.v.kstsa16.v2i16(<2 x i16> %465, <2 x i16> %466) + store volatile <2 x i16> %467, <2 x i16>* %i16x2_r, align 4 + %468 = load i32, i32* %ul_a, align 4 + %469 = load i32, i32* %ul_b, align 4 
+ %470 = call i32 @llvm.riscv.ksub8.i32(i32 %468, i32 %469) + store volatile i32 %470, i32* %ul_r, align 4 + %471 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %472 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %473 = call <4 x i8> @llvm.riscv.v.ksub8.v4i8(<4 x i8> %471, <4 x i8> %472) + store volatile <4 x i8> %473, <4 x i8>* %i8x4_r, align 4 + %474 = load i32, i32* %ul_a, align 4 + %475 = load i32, i32* %ul_b, align 4 + %476 = call i32 @llvm.riscv.ksub16.i32(i32 %474, i32 %475) + store volatile i32 %476, i32* %ul_r, align 4 + %477 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %478 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %479 = call <2 x i16> @llvm.riscv.v.ksub16.v2i16(<2 x i16> %477, <2 x i16> %478) + store volatile <2 x i16> %479, <2 x i16>* %i16x2_r, align 4 + %480 = load i32, i32* %i_a, align 4 + %481 = load i32, i32* %i_b, align 4 + %482 = call i32 @llvm.riscv.ksubh.i32(i32 %480, i32 %481) + store volatile i32 %482, i32* %l_r, align 4 + %483 = load i32, i32* %i_a, align 4 + %484 = load i32, i32* %i_b, align 4 + %485 = call i32 @llvm.riscv.ksubw.i32(i32 %483, i32 %484) + store volatile i32 %485, i32* %l_r, align 4 + %486 = load i32, i32* %l_a, align 4 + %487 = load i32, i32* %l_b, align 4 + %488 = call i32 @llvm.riscv.kwmmul.i32(i32 %486, i32 %487) + store volatile i32 %488, i32* %l_r, align 4 + %489 = load i32, i32* %l_a, align 4 + %490 = load i32, i32* %l_b, align 4 + %491 = call i32 @llvm.riscv.kwmmul.u.i32(i32 %489, i32 %490) + store volatile i32 %491, i32* %l_r, align 4 + %492 = load i32, i32* %i_a, align 4 + %493 = load i32, i32* %i_b, align 4 + %494 = call i32 @llvm.riscv.maxw.i32(i32 %492, i32 %493) + store volatile i32 %494, i32* %l_r, align 4 + %495 = load i32, i32* %i_a, align 4 + %496 = load i32, i32* %i_b, align 4 + %497 = call i32 @llvm.riscv.minw.i32(i32 %495, i32 %496) + store volatile i32 %497, i32* %l_r, align 4 + %498 = load i32, i32* %ul_a, align 4 + %499 = load i32, i32* %ul_b, align 4 + %500 = call i32 @llvm.riscv.pbsad.i32(i32 %498, i32 %499) + store volatile i32 %500, i32* %ul_r, align 4 + %501 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %502 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %503 = call i32 @llvm.riscv.v.pbsad.i32.v4i8(<4 x i8> %501, <4 x i8> %502) + store volatile i32 %503, i32* %ui_r, align 4 + %504 = load i32, i32* %ul_t, align 4 + %505 = load i32, i32* %ul_a, align 4 + %506 = load i32, i32* %ul_b, align 4 + %507 = call i32 @llvm.riscv.pbsada.i32(i32 %504, i32 %505, i32 %506) + store volatile i32 %507, i32* %ul_r, align 4 + %508 = load i32, i32* %ul_t, align 4 + %509 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %510 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %511 = call i32 @llvm.riscv.v.pbsada.i32.v4i8(i32 %508, <4 x i8> %509, <4 x i8> %510) + store volatile i32 %511, i32* %ui_r, align 4 + %512 = load i32, i32* %ul_a, align 4 + %513 = load i32, i32* %ul_b, align 4 + %514 = call i32 @llvm.riscv.pkbb16.i32(i32 %512, i32 %513) + store volatile i32 %514, i32* %ul_r, align 4 + %515 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %516 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %517 = call <2 x i16> @llvm.riscv.v.pkbb16.v2i16(<2 x i16> %515, <2 x i16> %516) + store volatile <2 x i16> %517, <2 x i16>* %u16x2_r, align 4 + %518 = load i32, i32* %ul_a, align 4 + %519 = load i32, i32* %ul_b, align 4 + %520 = call i32 @llvm.riscv.pkbt16.i32(i32 %518, i32 %519) + store volatile i32 %520, i32* %ul_r, align 4 + %521 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %522 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %523 = call <2 x 
i16> @llvm.riscv.v.pkbt16.v2i16(<2 x i16> %521, <2 x i16> %522) + store volatile <2 x i16> %523, <2 x i16>* %u16x2_r, align 4 + %524 = load i32, i32* %ul_a, align 4 + %525 = load i32, i32* %ul_b, align 4 + %526 = call i32 @llvm.riscv.pktt16.i32(i32 %524, i32 %525) + store volatile i32 %526, i32* %ul_r, align 4 + %527 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %528 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %529 = call <2 x i16> @llvm.riscv.v.pktt16.v2i16(<2 x i16> %527, <2 x i16> %528) + store volatile <2 x i16> %529, <2 x i16>* %u16x2_r, align 4 + %530 = load i32, i32* %ul_a, align 4 + %531 = load i32, i32* %ul_b, align 4 + %532 = call i32 @llvm.riscv.pktb16.i32(i32 %530, i32 %531) + store volatile i32 %532, i32* %ul_r, align 4 + %533 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %534 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %535 = call <2 x i16> @llvm.riscv.v.pktb16.v2i16(<2 x i16> %533, <2 x i16> %534) + store volatile <2 x i16> %535, <2 x i16>* %u16x2_r, align 4 + %536 = load i32, i32* %ul_a, align 4 + %537 = load i32, i32* %ul_b, align 4 + %538 = call i32 @llvm.riscv.radd8.i32(i32 %536, i32 %537) + store volatile i32 %538, i32* %ul_r, align 4 + %539 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %540 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %541 = call <4 x i8> @llvm.riscv.v.radd8.v4i8(<4 x i8> %539, <4 x i8> %540) + store volatile <4 x i8> %541, <4 x i8>* %i8x4_r, align 4 + %542 = load i32, i32* %ul_a, align 4 + %543 = load i32, i32* %ul_b, align 4 + %544 = call i32 @llvm.riscv.radd16.i32(i32 %542, i32 %543) + store volatile i32 %544, i32* %ul_r, align 4 + %545 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %546 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %547 = call <2 x i16> @llvm.riscv.v.radd16.v2i16(<2 x i16> %545, <2 x i16> %546) + store volatile <2 x i16> %547, <2 x i16>* %i16x2_r, align 4 + %548 = load i32, i32* %i_a, align 4 + %549 = load i32, i32* %i_b, align 4 + %550 = call i32 @llvm.riscv.raddw.i32(i32 %548, i32 %549) + store volatile i32 %550, i32* %l_r, align 4 + %551 = load i32, i32* %ul_a, align 4 + %552 = load i32, i32* %ul_b, align 4 + %553 = call i32 @llvm.riscv.rcras16.i32(i32 %551, i32 %552) + store volatile i32 %553, i32* %ul_r, align 4 + %554 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %555 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %556 = call <2 x i16> @llvm.riscv.v.rcras16.v2i16(<2 x i16> %554, <2 x i16> %555) + store volatile <2 x i16> %556, <2 x i16>* %i16x2_r, align 4 + %557 = load i32, i32* %ul_a, align 4 + %558 = load i32, i32* %ul_b, align 4 + %559 = call i32 @llvm.riscv.rcrsa16.i32(i32 %557, i32 %558) + store volatile i32 %559, i32* %ul_r, align 4 + %560 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %561 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %562 = call <2 x i16> @llvm.riscv.v.rcrsa16.v2i16(<2 x i16> %560, <2 x i16> %561) + store volatile <2 x i16> %562, <2 x i16>* %i16x2_r, align 4 + %563 = load i32, i32* %ul_a, align 4 + %564 = load i32, i32* %ul_b, align 4 + %565 = call i32 @llvm.riscv.rstas16.i32(i32 %563, i32 %564) + store volatile i32 %565, i32* %ul_r, align 4 + %566 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %567 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %568 = call <2 x i16> @llvm.riscv.v.rstas16.v2i16(<2 x i16> %566, <2 x i16> %567) + store volatile <2 x i16> %568, <2 x i16>* %i16x2_r, align 4 + %569 = load i32, i32* %ul_a, align 4 + %570 = load i32, i32* %ul_b, align 4 + %571 = call i32 @llvm.riscv.rstsa16.i32(i32 %569, i32 %570) + store volatile i32 %571, i32* %ul_r, 
align 4 + %572 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %573 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %574 = call <2 x i16> @llvm.riscv.v.rstsa16.v2i16(<2 x i16> %572, <2 x i16> %573) + store volatile <2 x i16> %574, <2 x i16>* %i16x2_r, align 4 + %575 = load i32, i32* %ul_a, align 4 + %576 = load i32, i32* %ul_b, align 4 + %577 = call i32 @llvm.riscv.rsub8.i32(i32 %575, i32 %576) + store volatile i32 %577, i32* %ul_r, align 4 + %578 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %579 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %580 = call <4 x i8> @llvm.riscv.v.rsub8.v4i8(<4 x i8> %578, <4 x i8> %579) + store volatile <4 x i8> %580, <4 x i8>* %i8x4_r, align 4 + %581 = load i32, i32* %ul_a, align 4 + %582 = load i32, i32* %ul_b, align 4 + %583 = call i32 @llvm.riscv.rsub16.i32(i32 %581, i32 %582) + store volatile i32 %583, i32* %ul_r, align 4 + %584 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %585 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %586 = call <2 x i16> @llvm.riscv.v.rsub16.v2i16(<2 x i16> %584, <2 x i16> %585) + store volatile <2 x i16> %586, <2 x i16>* %i16x2_r, align 4 + %587 = load i32, i32* %i_a, align 4 + %588 = load i32, i32* %i_b, align 4 + %589 = call i32 @llvm.riscv.rsubw.i32(i32 %587, i32 %588) + store volatile i32 %589, i32* %l_r, align 4 + %590 = load i32, i32* %ul_a, align 4 + %591 = call i32 @llvm.riscv.sclip8.i32(i32 %590, i32 5) + store volatile i32 %591, i32* %ul_r, align 4 + %592 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %593 = call <4 x i8> @llvm.riscv.v.sclip8.v4i8.i32(<4 x i8> %592, i32 5) + store volatile <4 x i8> %593, <4 x i8>* %i8x4_r, align 4 + %594 = load i32, i32* %ul_a, align 4 + %595 = call i32 @llvm.riscv.sclip16.i32(i32 %594, i32 6) + store volatile i32 %595, i32* %ul_r, align 4 + %596 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %597 = call <2 x i16> @llvm.riscv.v.sclip16.v2i16.i32(<2 x i16> %596, i32 6) + store volatile <2 x i16> %597, <2 x i16>* %i16x2_r, align 4 + %598 = load i32, i32* %l_a, align 4 + %599 = call i32 @llvm.riscv.sclip32.i32(i32 %598, i32 7) + store volatile i32 %599, i32* %l_r, align 4 + %600 = load i32, i32* %ul_a, align 4 + %601 = load i32, i32* %ul_b, align 4 + %602 = call i32 @llvm.riscv.scmple8.i32(i32 %600, i32 %601) + store volatile i32 %602, i32* %ul_r, align 4 + %603 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %604 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %605 = call <4 x i8> @llvm.riscv.v.scmple8.v4i8(<4 x i8> %603, <4 x i8> %604) + store volatile <4 x i8> %605, <4 x i8>* %u8x4_r, align 4 + %606 = load i32, i32* %ul_a, align 4 + %607 = load i32, i32* %ul_b, align 4 + %608 = call i32 @llvm.riscv.scmple16.i32(i32 %606, i32 %607) + store volatile i32 %608, i32* %ul_r, align 4 + %609 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %610 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %611 = call <2 x i16> @llvm.riscv.v.scmple16.v2i16(<2 x i16> %609, <2 x i16> %610) + store volatile <2 x i16> %611, <2 x i16>* %u16x2_r, align 4 + %612 = load i32, i32* %ul_a, align 4 + %613 = load i32, i32* %ul_b, align 4 + %614 = call i32 @llvm.riscv.scmplt8.i32(i32 %612, i32 %613) + store volatile i32 %614, i32* %ul_r, align 4 + %615 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %616 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %617 = call <4 x i8> @llvm.riscv.v.scmplt8.v4i8(<4 x i8> %615, <4 x i8> %616) + store volatile <4 x i8> %617, <4 x i8>* %u8x4_r, align 4 + %618 = load i32, i32* %ul_a, align 4 + %619 = load i32, i32* %ul_b, align 4 + %620 = call i32 @llvm.riscv.scmplt16.i32(i32 %618, i32 
%619) + store volatile i32 %620, i32* %ul_r, align 4 + %621 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %622 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %623 = call <2 x i16> @llvm.riscv.v.scmplt16.v2i16(<2 x i16> %621, <2 x i16> %622) + store volatile <2 x i16> %623, <2 x i16>* %u16x2_r, align 4 + %624 = load i32, i32* %ul_a, align 4 + %625 = load i32, i32* %ui_b, align 4 + %626 = call i32 @llvm.riscv.sll8.i32(i32 %624, i32 %625) + store volatile i32 %626, i32* %ul_r, align 4 + %627 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %628 = load i32, i32* %ui_b, align 4 + %629 = call <4 x i8> @llvm.riscv.v.sll8.v4i8.i32(<4 x i8> %627, i32 %628) + store volatile <4 x i8> %629, <4 x i8>* %u8x4_r, align 4 + %630 = load i32, i32* %ul_a, align 4 + %631 = load i32, i32* %ui_b, align 4 + %632 = call i32 @llvm.riscv.sll16.i32(i32 %630, i32 %631) + store volatile i32 %632, i32* %ul_r, align 4 + %633 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %634 = load i32, i32* %ui_b, align 4 + %635 = call <2 x i16> @llvm.riscv.v.sll16.v2i16.i32(<2 x i16> %633, i32 %634) + store volatile <2 x i16> %635, <2 x i16>* %u16x2_r, align 4 + %636 = load i32, i32* %l_t, align 4 + %637 = load i32, i32* %ul_a, align 4 + %638 = load i32, i32* %ul_b, align 4 + %639 = call i32 @llvm.riscv.smaqa.i32(i32 %636, i32 %637, i32 %638) + store volatile i32 %639, i32* %l_r, align 4 + %640 = load i32, i32* %i_t, align 4 + %641 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %642 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %643 = call i32 @llvm.riscv.v.smaqa.i32.v4i8(i32 %640, <4 x i8> %641, <4 x i8> %642) + store volatile i32 %643, i32* %i_r, align 4 + %644 = load i32, i32* %l_t, align 4 + %645 = load i32, i32* %ul_a, align 4 + %646 = load i32, i32* %ul_b, align 4 + %647 = call i32 @llvm.riscv.smaqa.su.i32(i32 %644, i32 %645, i32 %646) + store volatile i32 %647, i32* %l_r, align 4 + %648 = load i32, i32* %i_t, align 4 + %649 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %650 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %651 = call i32 @llvm.riscv.v.smaqa.su.i32.v4i8(i32 %648, <4 x i8> %649, <4 x i8> %650) + store volatile i32 %651, i32* %i_r, align 4 + %652 = load i32, i32* %ul_a, align 4 + %653 = load i32, i32* %ul_b, align 4 + %654 = call i32 @llvm.riscv.smax8.i32(i32 %652, i32 %653) + store volatile i32 %654, i32* %ul_r, align 4 + %655 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %656 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %657 = call <4 x i8> @llvm.riscv.v.smax8.v4i8(<4 x i8> %655, <4 x i8> %656) + store volatile <4 x i8> %657, <4 x i8>* %i8x4_r, align 4 + %658 = load i32, i32* %ul_a, align 4 + %659 = load i32, i32* %ul_b, align 4 + %660 = call i32 @llvm.riscv.smax16.i32(i32 %658, i32 %659) + store volatile i32 %660, i32* %ul_r, align 4 + %661 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %662 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %663 = call <2 x i16> @llvm.riscv.v.smax16.v2i16(<2 x i16> %661, <2 x i16> %662) + store volatile <2 x i16> %663, <2 x i16>* %i16x2_r, align 4 + %664 = load i32, i32* %ul_a, align 4 + %665 = load i32, i32* %ul_b, align 4 + %666 = call i32 @llvm.riscv.smbb16.i32(i32 %664, i32 %665) + store volatile i32 %666, i32* %l_r, align 4 + %667 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %668 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %669 = call i32 @llvm.riscv.v.smbb16.i32.v2i16(<2 x i16> %667, <2 x i16> %668) + store volatile i32 %669, i32* %i_r, align 4 + %670 = load i32, i32* %ul_a, align 4 + %671 = load i32, i32* %ul_b, align 4 + %672 = call i32 
@llvm.riscv.smbt16.i32(i32 %670, i32 %671) + store volatile i32 %672, i32* %l_r, align 4 + %673 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %674 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %675 = call i32 @llvm.riscv.v.smbt16.i32.v2i16(<2 x i16> %673, <2 x i16> %674) + store volatile i32 %675, i32* %i_r, align 4 + %676 = load i32, i32* %ul_a, align 4 + %677 = load i32, i32* %ul_b, align 4 + %678 = call i32 @llvm.riscv.smtt16.i32(i32 %676, i32 %677) + store volatile i32 %678, i32* %l_r, align 4 + %679 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %680 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %681 = call i32 @llvm.riscv.v.smtt16.i32.v2i16(<2 x i16> %679, <2 x i16> %680) + store volatile i32 %681, i32* %i_r, align 4 + %682 = load i32, i32* %ul_a, align 4 + %683 = load i32, i32* %ul_b, align 4 + %684 = call i32 @llvm.riscv.smds.i32(i32 %682, i32 %683) + store volatile i32 %684, i32* %l_r, align 4 + %685 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %686 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %687 = call i32 @llvm.riscv.v.smds.i32.v2i16(<2 x i16> %685, <2 x i16> %686) + store volatile i32 %687, i32* %i_r, align 4 + %688 = load i32, i32* %ul_a, align 4 + %689 = load i32, i32* %ul_b, align 4 + %690 = call i32 @llvm.riscv.smdrs.i32(i32 %688, i32 %689) + store volatile i32 %690, i32* %l_r, align 4 + %691 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %692 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %693 = call i32 @llvm.riscv.v.smdrs.i32.v2i16(<2 x i16> %691, <2 x i16> %692) + store volatile i32 %693, i32* %i_r, align 4 + %694 = load i32, i32* %ul_a, align 4 + %695 = load i32, i32* %ul_b, align 4 + %696 = call i32 @llvm.riscv.smxds.i32(i32 %694, i32 %695) + store volatile i32 %696, i32* %l_r, align 4 + %697 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %698 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %699 = call i32 @llvm.riscv.v.smxds.i32.v2i16(<2 x i16> %697, <2 x i16> %698) + store volatile i32 %699, i32* %i_r, align 4 + %700 = load i32, i32* %ul_a, align 4 + %701 = load i32, i32* %ul_b, align 4 + %702 = call i32 @llvm.riscv.smin8.i32(i32 %700, i32 %701) + store volatile i32 %702, i32* %ul_r, align 4 + %703 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %704 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %705 = call <4 x i8> @llvm.riscv.v.smin8.v4i8(<4 x i8> %703, <4 x i8> %704) + store volatile <4 x i8> %705, <4 x i8>* %i8x4_r, align 4 + %706 = load i32, i32* %ul_a, align 4 + %707 = load i32, i32* %ul_b, align 4 + %708 = call i32 @llvm.riscv.smin16.i32(i32 %706, i32 %707) + store volatile i32 %708, i32* %ul_r, align 4 + %709 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %710 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %711 = call <2 x i16> @llvm.riscv.v.smin16.v2i16(<2 x i16> %709, <2 x i16> %710) + store volatile <2 x i16> %711, <2 x i16>* %i16x2_r, align 4 + %712 = load i32, i32* %l_a, align 4 + %713 = load i32, i32* %l_b, align 4 + %714 = call i32 @llvm.riscv.smmul.i32(i32 %712, i32 %713) + store volatile i32 %714, i32* %l_r, align 4 + %715 = load i32, i32* %l_a, align 4 + %716 = load i32, i32* %l_b, align 4 + %717 = call i32 @llvm.riscv.smmul.u.i32(i32 %715, i32 %716) + store volatile i32 %717, i32* %l_r, align 4 + %718 = load i32, i32* %l_a, align 4 + %719 = load i32, i32* %ul_b, align 4 + %720 = call i32 @llvm.riscv.smmwb.i32(i32 %718, i32 %719) + store volatile i32 %720, i32* %l_r, align 4 + %721 = load i32, i32* %i_a, align 4 + %722 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %723 = call i32 @llvm.riscv.v.smmwb.i32.v2i16(i32 
%721, <2 x i16> %722) + store volatile i32 %723, i32* %i_r, align 4 + %724 = load i32, i32* %l_a, align 4 + %725 = load i32, i32* %ul_b, align 4 + %726 = call i32 @llvm.riscv.smmwb.u.i32(i32 %724, i32 %725) + store volatile i32 %726, i32* %l_r, align 4 + %727 = load i32, i32* %i_a, align 4 + %728 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %729 = call i32 @llvm.riscv.v.smmwb.u.i32.v2i16(i32 %727, <2 x i16> %728) + store volatile i32 %729, i32* %i_r, align 4 + %730 = load i32, i32* %l_a, align 4 + %731 = load i32, i32* %ul_b, align 4 + %732 = call i32 @llvm.riscv.smmwt.i32(i32 %730, i32 %731) + store volatile i32 %732, i32* %l_r, align 4 + %733 = load i32, i32* %i_a, align 4 + %734 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %735 = call i32 @llvm.riscv.v.smmwt.i32.v2i16(i32 %733, <2 x i16> %734) + store volatile i32 %735, i32* %i_r, align 4 + %736 = load i32, i32* %l_a, align 4 + %737 = load i32, i32* %ul_b, align 4 + %738 = call i32 @llvm.riscv.smmwt.u.i32(i32 %736, i32 %737) + store volatile i32 %738, i32* %l_r, align 4 + %739 = load i32, i32* %i_a, align 4 + %740 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %741 = call i32 @llvm.riscv.v.smmwt.u.i32.v2i16(i32 %739, <2 x i16> %740) + store volatile i32 %741, i32* %i_r, align 4 + %742 = load i32, i32* %l_a, align 4 + %743 = load i32, i32* %ui_b, align 4 + %744 = call i32 @llvm.riscv.sra.u.i32(i32 %742, i32 %743) + store volatile i32 %744, i32* %l_r, align 4 + %745 = load i32, i32* %ul_a, align 4 + %746 = load i32, i32* %ui_b, align 4 + %747 = call i32 @llvm.riscv.sra8.i32(i32 %745, i32 %746) + store volatile i32 %747, i32* %ul_r, align 4 + %748 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %749 = load i32, i32* %ui_b, align 4 + %750 = call <4 x i8> @llvm.riscv.v.sra8.v4i8.i32(<4 x i8> %748, i32 %749) + store volatile <4 x i8> %750, <4 x i8>* %i8x4_r, align 4 + %751 = load i32, i32* %ul_a, align 4 + %752 = load i32, i32* %ui_b, align 4 + %753 = call i32 @llvm.riscv.sra8.u.i32(i32 %751, i32 %752) + store volatile i32 %753, i32* %ul_r, align 4 + %754 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %755 = load i32, i32* %ui_b, align 4 + %756 = call <4 x i8> @llvm.riscv.v.sra8.u.v4i8.i32(<4 x i8> %754, i32 %755) + store volatile <4 x i8> %756, <4 x i8>* %i8x4_r, align 4 + %757 = load i32, i32* %ul_a, align 4 + %758 = load i32, i32* %ui_b, align 4 + %759 = call i32 @llvm.riscv.sra16.i32(i32 %757, i32 %758) + store volatile i32 %759, i32* %ul_r, align 4 + %760 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %761 = load i32, i32* %ui_b, align 4 + %762 = call <2 x i16> @llvm.riscv.v.sra16.v2i16.i32(<2 x i16> %760, i32 %761) + store volatile <2 x i16> %762, <2 x i16>* %i16x2_r, align 4 + %763 = load i32, i32* %ul_a, align 4 + %764 = load i32, i32* %ui_b, align 4 + %765 = call i32 @llvm.riscv.sra16.u.i32(i32 %763, i32 %764) + store volatile i32 %765, i32* %ul_r, align 4 + %766 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %767 = load i32, i32* %ui_b, align 4 + %768 = call <2 x i16> @llvm.riscv.v.sra16.u.v2i16.i32(<2 x i16> %766, i32 %767) + store volatile <2 x i16> %768, <2 x i16>* %i16x2_r, align 4 + %769 = load i32, i32* %ul_a, align 4 + %770 = load i32, i32* %ui_b, align 4 + %771 = call i32 @llvm.riscv.srl8.i32(i32 %769, i32 %770) + store volatile i32 %771, i32* %ul_r, align 4 + %772 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %773 = load i32, i32* %ui_b, align 4 + %774 = call <4 x i8> @llvm.riscv.v.srl8.v4i8.i32(<4 x i8> %772, i32 %773) + store volatile <4 x i8> %774, <4 x i8>* %u8x4_r, align 4 + %775 = load i32, i32* %ul_a, 
align 4 + %776 = load i32, i32* %ui_b, align 4 + %777 = call i32 @llvm.riscv.srl8.u.i32(i32 %775, i32 %776) + store volatile i32 %777, i32* %ul_r, align 4 + %778 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %779 = load i32, i32* %ui_b, align 4 + %780 = call <4 x i8> @llvm.riscv.v.srl8.u.v4i8.i32(<4 x i8> %778, i32 %779) + store volatile <4 x i8> %780, <4 x i8>* %u8x4_r, align 4 + %781 = load i32, i32* %ul_a, align 4 + %782 = load i32, i32* %ui_b, align 4 + %783 = call i32 @llvm.riscv.srl16.i32(i32 %781, i32 %782) + store volatile i32 %783, i32* %ul_r, align 4 + %784 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %785 = load i32, i32* %ui_b, align 4 + %786 = call <2 x i16> @llvm.riscv.v.srl16.v2i16.i32(<2 x i16> %784, i32 %785) + store volatile <2 x i16> %786, <2 x i16>* %u16x2_r, align 4 + %787 = load i32, i32* %ul_a, align 4 + %788 = load i32, i32* %ui_b, align 4 + %789 = call i32 @llvm.riscv.srl16.u.i32(i32 %787, i32 %788) + store volatile i32 %789, i32* %ul_r, align 4 + %790 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %791 = load i32, i32* %ui_b, align 4 + %792 = call <2 x i16> @llvm.riscv.v.srl16.u.v2i16.i32(<2 x i16> %790, i32 %791) + store volatile <2 x i16> %792, <2 x i16>* %u16x2_r, align 4 + %793 = load i32, i32* %ul_a, align 4 + %794 = load i32, i32* %ul_b, align 4 + %795 = call i32 @llvm.riscv.stas16.i32(i32 %793, i32 %794) + store volatile i32 %795, i32* %ul_r, align 4 + %796 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %797 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %798 = call <2 x i16> @llvm.riscv.v.stas16.v2i16(<2 x i16> %796, <2 x i16> %797) + store volatile <2 x i16> %798, <2 x i16>* %u16x2_r, align 4 + %799 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %800 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %801 = call <2 x i16> @llvm.riscv.v.stas16.v2i16(<2 x i16> %799, <2 x i16> %800) + store volatile <2 x i16> %801, <2 x i16>* %i16x2_r, align 4 + %802 = load i32, i32* %ul_a, align 4 + %803 = load i32, i32* %ul_b, align 4 + %804 = call i32 @llvm.riscv.stsa16.i32(i32 %802, i32 %803) + store volatile i32 %804, i32* %ul_r, align 4 + %805 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %806 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %807 = call <2 x i16> @llvm.riscv.v.stsa16.v2i16(<2 x i16> %805, <2 x i16> %806) + store volatile <2 x i16> %807, <2 x i16>* %u16x2_r, align 4 + %808 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %809 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %810 = call <2 x i16> @llvm.riscv.v.stsa16.v2i16(<2 x i16> %808, <2 x i16> %809) + store volatile <2 x i16> %810, <2 x i16>* %i16x2_r, align 4 + %811 = load i32, i32* %ul_a, align 4 + %812 = load i32, i32* %ul_b, align 4 + %813 = call i32 @llvm.riscv.sub8.i32(i32 %811, i32 %812) + store volatile i32 %813, i32* %ul_r, align 4 + %814 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %815 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %816 = call <4 x i8> @llvm.riscv.v.sub8.v4i8(<4 x i8> %814, <4 x i8> %815) + store volatile <4 x i8> %816, <4 x i8>* %u8x4_r, align 4 + %817 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %818 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %819 = call <4 x i8> @llvm.riscv.v.sub8.v4i8(<4 x i8> %817, <4 x i8> %818) + store volatile <4 x i8> %819, <4 x i8>* %i8x4_r, align 4 + %820 = load i32, i32* %ul_a, align 4 + %821 = load i32, i32* %ul_b, align 4 + %822 = call i32 @llvm.riscv.sub16.i32(i32 %820, i32 %821) + store volatile i32 %822, i32* %ul_r, align 4 + %823 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %824 = load <2 x i16>, <2 x i16>* %u16x2_b, 
align 4 + %825 = call <2 x i16> @llvm.riscv.v.sub16.v2i16(<2 x i16> %823, <2 x i16> %824) + store volatile <2 x i16> %825, <2 x i16>* %u16x2_r, align 4 + %826 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %827 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %828 = call <2 x i16> @llvm.riscv.v.sub16.v2i16(<2 x i16> %826, <2 x i16> %827) + store volatile <2 x i16> %828, <2 x i16>* %i16x2_r, align 4 + %829 = load i32, i32* %ul_a, align 4 + %830 = call i32 @llvm.riscv.sunpkd810.i32(i32 %829) + store volatile i32 %830, i32* %ul_r, align 4 + %831 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %832 = call <2 x i16> @llvm.riscv.v.sunpkd810.v2i16(<4 x i8> %831) + store volatile <2 x i16> %832, <2 x i16>* %i16x2_r, align 4 + %833 = load i32, i32* %ul_a, align 4 + %834 = call i32 @llvm.riscv.sunpkd820.i32(i32 %833) + store volatile i32 %834, i32* %ul_r, align 4 + %835 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %836 = call <2 x i16> @llvm.riscv.v.sunpkd820.v2i16(<4 x i8> %835) + store volatile <2 x i16> %836, <2 x i16>* %i16x2_r, align 4 + %837 = load i32, i32* %ul_a, align 4 + %838 = call i32 @llvm.riscv.sunpkd830.i32(i32 %837) + store volatile i32 %838, i32* %ul_r, align 4 + %839 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %840 = call <2 x i16> @llvm.riscv.v.sunpkd830.v2i16(<4 x i8> %839) + store volatile <2 x i16> %840, <2 x i16>* %i16x2_r, align 4 + %841 = load i32, i32* %ul_a, align 4 + %842 = call i32 @llvm.riscv.sunpkd831.i32(i32 %841) + store volatile i32 %842, i32* %ul_r, align 4 + %843 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %844 = call <2 x i16> @llvm.riscv.v.sunpkd831.v2i16(<4 x i8> %843) + store volatile <2 x i16> %844, <2 x i16>* %i16x2_r, align 4 + %845 = load i32, i32* %ul_a, align 4 + %846 = call i32 @llvm.riscv.sunpkd832.i32(i32 %845) + store volatile i32 %846, i32* %ul_r, align 4 + %847 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %848 = call <2 x i16> @llvm.riscv.v.sunpkd832.v2i16(<4 x i8> %847) + store volatile <2 x i16> %848, <2 x i16>* %i16x2_r, align 4 + %849 = load i32, i32* %ul_a, align 4 + %850 = call i32 @llvm.riscv.swap8.i32(i32 %849) + store volatile i32 %850, i32* %ul_r, align 4 + %851 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %852 = call <4 x i8> @llvm.riscv.v.swap8.v4i8(<4 x i8> %851) + store volatile <4 x i8> %852, <4 x i8>* %u8x4_r, align 4 + %853 = load i32, i32* %ul_a, align 4 + %854 = call i32 @llvm.riscv.swap16.i32(i32 %853) + store volatile i32 %854, i32* %ul_r, align 4 + %855 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %856 = call <2 x i16> @llvm.riscv.v.swap16.v2i16(<2 x i16> %855) + store volatile <2 x i16> %856, <2 x i16>* %u16x2_r, align 4 + %857 = load i32, i32* %ul_a, align 4 + %858 = call i32 @llvm.riscv.uclip8.i32(i32 %857, i32 5) + store volatile i32 %858, i32* %ul_r, align 4 + %859 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %860 = call <4 x i8> @llvm.riscv.v.uclip8.v4i8.i32(<4 x i8> %859, i32 5) + store volatile <4 x i8> %860, <4 x i8>* %u8x4_r, align 4 + %861 = load i32, i32* %ul_a, align 4 + %862 = call i32 @llvm.riscv.uclip16.i32(i32 %861, i32 6) + store volatile i32 %862, i32* %ul_r, align 4 + %863 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %864 = call <2 x i16> @llvm.riscv.v.uclip16.v2i16.i32(<2 x i16> %863, i32 6) + store volatile <2 x i16> %864, <2 x i16>* %u16x2_r, align 4 + %865 = load i32, i32* %l_a, align 4 + %866 = call i32 @llvm.riscv.uclip32.i32(i32 %865, i32 7) + store volatile i32 %866, i32* %l_r, align 4 + %867 = load i32, i32* %ul_a, align 4 + %868 = load i32, i32* %ul_b, align 4 + %869 = call i32 
@llvm.riscv.ucmple8.i32(i32 %867, i32 %868) + store volatile i32 %869, i32* %ul_r, align 4 + %870 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %871 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %872 = call <4 x i8> @llvm.riscv.v.ucmple8.v4i8(<4 x i8> %870, <4 x i8> %871) + store volatile <4 x i8> %872, <4 x i8>* %u8x4_r, align 4 + %873 = load i32, i32* %ul_a, align 4 + %874 = load i32, i32* %ul_b, align 4 + %875 = call i32 @llvm.riscv.ucmple16.i32(i32 %873, i32 %874) + store volatile i32 %875, i32* %ul_r, align 4 + %876 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %877 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %878 = call <2 x i16> @llvm.riscv.v.ucmple16.v2i16(<2 x i16> %876, <2 x i16> %877) + store volatile <2 x i16> %878, <2 x i16>* %u16x2_r, align 4 + %879 = load i32, i32* %ul_a, align 4 + %880 = load i32, i32* %ul_b, align 4 + %881 = call i32 @llvm.riscv.ucmplt8.i32(i32 %879, i32 %880) + store volatile i32 %881, i32* %ul_r, align 4 + %882 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %883 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %884 = call <4 x i8> @llvm.riscv.v.ucmplt8.v4i8(<4 x i8> %882, <4 x i8> %883) + store volatile <4 x i8> %884, <4 x i8>* %u8x4_r, align 4 + %885 = load i32, i32* %ul_a, align 4 + %886 = load i32, i32* %ul_b, align 4 + %887 = call i32 @llvm.riscv.ucmplt16.i32(i32 %885, i32 %886) + store volatile i32 %887, i32* %ul_r, align 4 + %888 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %889 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %890 = call <2 x i16> @llvm.riscv.v.ucmplt16.v2i16(<2 x i16> %888, <2 x i16> %889) + store volatile <2 x i16> %890, <2 x i16>* %u16x2_r, align 4 + %891 = load i32, i32* %ul_a, align 4 + %892 = load i32, i32* %ul_b, align 4 + %893 = call i32 @llvm.riscv.ukadd8.i32(i32 %891, i32 %892) + store volatile i32 %893, i32* %ul_r, align 4 + %894 = load <4 x i8>, <4 x i8>* %i8x4_a, align 4 + %895 = load <4 x i8>, <4 x i8>* %i8x4_b, align 4 + %896 = call <4 x i8> @llvm.riscv.v.ukadd8.v4i8(<4 x i8> %894, <4 x i8> %895) + store volatile <4 x i8> %896, <4 x i8>* %i8x4_r, align 4 + %897 = load i32, i32* %ul_a, align 4 + %898 = load i32, i32* %ul_b, align 4 + %899 = call i32 @llvm.riscv.ukadd16.i32(i32 %897, i32 %898) + store volatile i32 %899, i32* %ul_r, align 4 + %900 = load <2 x i16>, <2 x i16>* %i16x2_a, align 4 + %901 = load <2 x i16>, <2 x i16>* %i16x2_b, align 4 + %902 = call <2 x i16> @llvm.riscv.v.ukadd16.v2i16(<2 x i16> %900, <2 x i16> %901) + store volatile <2 x i16> %902, <2 x i16>* %i16x2_r, align 4 + %903 = load i32, i32* %i_a, align 4 + %904 = load i32, i32* %i_b, align 4 + %905 = call i32 @llvm.riscv.ukaddh.i32(i32 %903, i32 %904) + store volatile i32 %905, i32* %l_r, align 4 + %906 = load i32, i32* %i_a, align 4 + %907 = load i32, i32* %i_b, align 4 + %908 = call i32 @llvm.riscv.ukaddw.i32(i32 %906, i32 %907) + store volatile i32 %908, i32* %l_r, align 4 + %909 = load i32, i32* %ul_a, align 4 + %910 = load i32, i32* %ul_b, align 4 + %911 = call i32 @llvm.riscv.ukcras16.i32(i32 %909, i32 %910) + store volatile i32 %911, i32* %ul_r, align 4 + %912 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %913 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %914 = call <2 x i16> @llvm.riscv.v.ukcras16.v2i16(<2 x i16> %912, <2 x i16> %913) + store volatile <2 x i16> %914, <2 x i16>* %u16x2_r, align 4 + %915 = load i32, i32* %ul_a, align 4 + %916 = load i32, i32* %ul_b, align 4 + %917 = call i32 @llvm.riscv.ukcrsa16.i32(i32 %915, i32 %916) + store volatile i32 %917, i32* %ul_r, align 4 + %918 = load <2 x i16>, <2 x i16>* 
%u16x2_a, align 4 + %919 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %920 = call <2 x i16> @llvm.riscv.v.ukcrsa16.v2i16(<2 x i16> %918, <2 x i16> %919) + store volatile <2 x i16> %920, <2 x i16>* %u16x2_r, align 4 + %921 = load i32, i32* %ul_a, align 4 + %922 = load i32, i32* %ul_b, align 4 + %923 = call i32 @llvm.riscv.ukstas16.i32(i32 %921, i32 %922) + store volatile i32 %923, i32* %ul_r, align 4 + %924 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %925 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %926 = call <2 x i16> @llvm.riscv.v.ukstas16.v2i16(<2 x i16> %924, <2 x i16> %925) + store volatile <2 x i16> %926, <2 x i16>* %u16x2_r, align 4 + %927 = load i32, i32* %ul_a, align 4 + %928 = load i32, i32* %ul_b, align 4 + %929 = call i32 @llvm.riscv.ukstsa16.i32(i32 %927, i32 %928) + store volatile i32 %929, i32* %ul_r, align 4 + %930 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %931 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %932 = call <2 x i16> @llvm.riscv.v.ukstsa16.v2i16(<2 x i16> %930, <2 x i16> %931) + store volatile <2 x i16> %932, <2 x i16>* %u16x2_r, align 4 + %933 = load i32, i32* %ul_a, align 4 + %934 = load i32, i32* %ul_b, align 4 + %935 = call i32 @llvm.riscv.uksub8.i32(i32 %933, i32 %934) + store volatile i32 %935, i32* %ul_r, align 4 + %936 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %937 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %938 = call <4 x i8> @llvm.riscv.v.uksub8.v4i8(<4 x i8> %936, <4 x i8> %937) + store volatile <4 x i8> %938, <4 x i8>* %u8x4_r, align 4 + %939 = load i32, i32* %ul_a, align 4 + %940 = load i32, i32* %ul_b, align 4 + %941 = call i32 @llvm.riscv.uksub16.i32(i32 %939, i32 %940) + store volatile i32 %941, i32* %ul_r, align 4 + %942 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %943 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %944 = call <2 x i16> @llvm.riscv.v.uksub16.v2i16(<2 x i16> %942, <2 x i16> %943) + store volatile <2 x i16> %944, <2 x i16>* %u16x2_r, align 4 + %945 = load i32, i32* %ui_a, align 4 + %946 = load i32, i32* %ui_b, align 4 + %947 = call i32 @llvm.riscv.uksubh.i32(i32 %945, i32 %946) + store volatile i32 %947, i32* %ul_r, align 4 + %948 = load i32, i32* %ui_a, align 4 + %949 = load i32, i32* %ui_b, align 4 + %950 = call i32 @llvm.riscv.uksubw.i32(i32 %948, i32 %949) + store volatile i32 %950, i32* %ul_r, align 4 + %951 = load i32, i32* %ul_t, align 4 + %952 = load i32, i32* %ul_a, align 4 + %953 = load i32, i32* %ul_b, align 4 + %954 = call i32 @llvm.riscv.umaqa.i32(i32 %951, i32 %952, i32 %953) + store volatile i32 %954, i32* %ul_r, align 4 + %955 = load i32, i32* %ui_t, align 4 + %956 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %957 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %958 = call i32 @llvm.riscv.v.umaqa.i32.v4i8(i32 %955, <4 x i8> %956, <4 x i8> %957) + store volatile i32 %958, i32* %ui_r, align 4 + %959 = load i32, i32* %ul_a, align 4 + %960 = load i32, i32* %ul_b, align 4 + %961 = call i32 @llvm.riscv.umax8.i32(i32 %959, i32 %960) + store volatile i32 %961, i32* %ul_r, align 4 + %962 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %963 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %964 = call <4 x i8> @llvm.riscv.v.umax8.v4i8(<4 x i8> %962, <4 x i8> %963) + store volatile <4 x i8> %964, <4 x i8>* %u8x4_r, align 4 + %965 = load i32, i32* %ul_a, align 4 + %966 = load i32, i32* %ul_b, align 4 + %967 = call i32 @llvm.riscv.umax16.i32(i32 %965, i32 %966) + store volatile i32 %967, i32* %ul_r, align 4 + %968 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %969 = load <2 x i16>, <2 x i16>* 
%u16x2_b, align 4 + %970 = call <2 x i16> @llvm.riscv.v.umax16.v2i16(<2 x i16> %968, <2 x i16> %969) + store volatile <2 x i16> %970, <2 x i16>* %u16x2_r, align 4 + %971 = load i32, i32* %ul_a, align 4 + %972 = load i32, i32* %ul_b, align 4 + %973 = call i32 @llvm.riscv.umin8.i32(i32 %971, i32 %972) + store volatile i32 %973, i32* %ul_r, align 4 + %974 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %975 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %976 = call <4 x i8> @llvm.riscv.v.umin8.v4i8(<4 x i8> %974, <4 x i8> %975) + store volatile <4 x i8> %976, <4 x i8>* %u8x4_r, align 4 + %977 = load i32, i32* %ul_a, align 4 + %978 = load i32, i32* %ul_b, align 4 + %979 = call i32 @llvm.riscv.umin16.i32(i32 %977, i32 %978) + store volatile i32 %979, i32* %ul_r, align 4 + %980 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %981 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %982 = call <2 x i16> @llvm.riscv.v.umin16.v2i16(<2 x i16> %980, <2 x i16> %981) + store volatile <2 x i16> %982, <2 x i16>* %u16x2_r, align 4 + %983 = load i32, i32* %ul_a, align 4 + %984 = load i32, i32* %ul_b, align 4 + %985 = call i32 @llvm.riscv.uradd8.i32(i32 %983, i32 %984) + store volatile i32 %985, i32* %ul_r, align 4 + %986 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %987 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %988 = call <4 x i8> @llvm.riscv.v.uradd8.v4i8(<4 x i8> %986, <4 x i8> %987) + store volatile <4 x i8> %988, <4 x i8>* %u8x4_r, align 4 + %989 = load i32, i32* %ul_a, align 4 + %990 = load i32, i32* %ul_b, align 4 + %991 = call i32 @llvm.riscv.uradd16.i32(i32 %989, i32 %990) + store volatile i32 %991, i32* %ul_r, align 4 + %992 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %993 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %994 = call <2 x i16> @llvm.riscv.v.uradd16.v2i16(<2 x i16> %992, <2 x i16> %993) + store volatile <2 x i16> %994, <2 x i16>* %u16x2_r, align 4 + %995 = load i32, i32* %ui_a, align 4 + %996 = load i32, i32* %ui_b, align 4 + %997 = call i32 @llvm.riscv.uraddw.i32(i32 %995, i32 %996) + store volatile i32 %997, i32* %ul_r, align 4 + %998 = load i32, i32* %ul_a, align 4 + %999 = load i32, i32* %ul_b, align 4 + %1000 = call i32 @llvm.riscv.urcras16.i32(i32 %998, i32 %999) + store volatile i32 %1000, i32* %ul_r, align 4 + %1001 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %1002 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %1003 = call <2 x i16> @llvm.riscv.v.urcras16.v2i16(<2 x i16> %1001, <2 x i16> %1002) + store volatile <2 x i16> %1003, <2 x i16>* %u16x2_r, align 4 + %1004 = load i32, i32* %ul_a, align 4 + %1005 = load i32, i32* %ul_b, align 4 + %1006 = call i32 @llvm.riscv.urcrsa16.i32(i32 %1004, i32 %1005) + store volatile i32 %1006, i32* %ul_r, align 4 + %1007 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %1008 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %1009 = call <2 x i16> @llvm.riscv.v.urcrsa16.v2i16(<2 x i16> %1007, <2 x i16> %1008) + store volatile <2 x i16> %1009, <2 x i16>* %u16x2_r, align 4 + %1010 = load i32, i32* %ul_a, align 4 + %1011 = load i32, i32* %ul_b, align 4 + %1012 = call i32 @llvm.riscv.urstas16.i32(i32 %1010, i32 %1011) + store volatile i32 %1012, i32* %ul_r, align 4 + %1013 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %1014 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %1015 = call <2 x i16> @llvm.riscv.v.urstas16.v2i16(<2 x i16> %1013, <2 x i16> %1014) + store volatile <2 x i16> %1015, <2 x i16>* %u16x2_r, align 4 + %1016 = load i32, i32* %ul_a, align 4 + %1017 = load i32, i32* %ul_b, align 4 + %1018 = call i32 
@llvm.riscv.urstsa16.i32(i32 %1016, i32 %1017) + store volatile i32 %1018, i32* %ul_r, align 4 + %1019 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %1020 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %1021 = call <2 x i16> @llvm.riscv.v.urstsa16.v2i16(<2 x i16> %1019, <2 x i16> %1020) + store volatile <2 x i16> %1021, <2 x i16>* %u16x2_r, align 4 + %1022 = load i32, i32* %ul_a, align 4 + %1023 = load i32, i32* %ul_b, align 4 + %1024 = call i32 @llvm.riscv.ursub8.i32(i32 %1022, i32 %1023) + store volatile i32 %1024, i32* %ul_r, align 4 + %1025 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %1026 = load <4 x i8>, <4 x i8>* %u8x4_b, align 4 + %1027 = call <4 x i8> @llvm.riscv.v.ursub8.v4i8(<4 x i8> %1025, <4 x i8> %1026) + store volatile <4 x i8> %1027, <4 x i8>* %u8x4_r, align 4 + %1028 = load i32, i32* %ul_a, align 4 + %1029 = load i32, i32* %ul_b, align 4 + %1030 = call i32 @llvm.riscv.ursub16.i32(i32 %1028, i32 %1029) + store volatile i32 %1030, i32* %ul_r, align 4 + %1031 = load <2 x i16>, <2 x i16>* %u16x2_a, align 4 + %1032 = load <2 x i16>, <2 x i16>* %u16x2_b, align 4 + %1033 = call <2 x i16> @llvm.riscv.v.ursub16.v2i16(<2 x i16> %1031, <2 x i16> %1032) + store volatile <2 x i16> %1033, <2 x i16>* %u16x2_r, align 4 + %1034 = load i32, i32* %ui_a, align 4 + %1035 = load i32, i32* %ui_b, align 4 + %1036 = call i32 @llvm.riscv.ursubw.i32(i32 %1034, i32 %1035) + store volatile i32 %1036, i32* %ul_r, align 4 + %1037 = load i32, i32* %ul_a, align 4 + %1038 = call i32 @llvm.riscv.zunpkd810.i32(i32 %1037) + store volatile i32 %1038, i32* %ul_r, align 4 + %1039 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %1040 = call <2 x i16> @llvm.riscv.v.zunpkd810.v2i16(<4 x i8> %1039) + store volatile <2 x i16> %1040, <2 x i16>* %u16x2_r, align 4 + %1041 = load i32, i32* %ul_a, align 4 + %1042 = call i32 @llvm.riscv.zunpkd820.i32(i32 %1041) + store volatile i32 %1042, i32* %ul_r, align 4 + %1043 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %1044 = call <2 x i16> @llvm.riscv.v.zunpkd820.v2i16(<4 x i8> %1043) + store volatile <2 x i16> %1044, <2 x i16>* %u16x2_r, align 4 + %1045 = load i32, i32* %ul_a, align 4 + %1046 = call i32 @llvm.riscv.zunpkd830.i32(i32 %1045) + store volatile i32 %1046, i32* %ul_r, align 4 + %1047 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %1048 = call <2 x i16> @llvm.riscv.v.zunpkd830.v2i16(<4 x i8> %1047) + store volatile <2 x i16> %1048, <2 x i16>* %u16x2_r, align 4 + %1049 = load i32, i32* %ul_a, align 4 + %1050 = call i32 @llvm.riscv.zunpkd831.i32(i32 %1049) + store volatile i32 %1050, i32* %ul_r, align 4 + %1051 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %1052 = call <2 x i16> @llvm.riscv.v.zunpkd831.v2i16(<4 x i8> %1051) + store volatile <2 x i16> %1052, <2 x i16>* %u16x2_r, align 4 + %1053 = load i32, i32* %ul_a, align 4 + %1054 = call i32 @llvm.riscv.zunpkd832.i32(i32 %1053) + store volatile i32 %1054, i32* %ul_r, align 4 + %1055 = load <4 x i8>, <4 x i8>* %u8x4_a, align 4 + %1056 = call <2 x i16> @llvm.riscv.v.zunpkd832.v2i16(<4 x i8> %1055) + store volatile <2 x i16> %1056, <2 x i16>* %u16x2_r, align 4 + ret void +} + +declare i32 @llvm.riscv.add8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.add8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.add16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.add16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ave.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.bitrev.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.bpick.i32(i32, i32, i32) nounwind + +declare i32 
@llvm.riscv.clrs8.i32(i32) nounwind + +declare <4 x i8> @llvm.riscv.v.clrs8.v4i8(<4 x i8>) nounwind + +declare i32 @llvm.riscv.clrs16.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.clrs16.v2i16(<2 x i16>) nounwind + +declare i32 @llvm.riscv.clrs32.i32(i32) nounwind + +declare i32 @llvm.riscv.clo8.i32(i32) nounwind + +declare <4 x i8> @llvm.riscv.v.clo8.v4i8(<4 x i8>) nounwind + +declare i32 @llvm.riscv.clo16.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.clo16.v2i16(<2 x i16>) nounwind + +declare i32 @llvm.riscv.clo32.i32(i32) nounwind + +declare i32 @llvm.riscv.clz8.i32(i32) nounwind + +declare <4 x i8> @llvm.riscv.v.clz8.v4i8(<4 x i8>) nounwind + +declare i32 @llvm.riscv.clz16.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.clz16.v2i16(<2 x i16>) nounwind + +declare i32 @llvm.riscv.clz32.i32(i32) nounwind + +declare i32 @llvm.riscv.cmpeq8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.cmpeq8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.cmpeq16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.cmpeq16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.cras16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.cras16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.crsa16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.crsa16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.insb.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.kabs8.i32(i32) nounwind + +declare <4 x i8> @llvm.riscv.v.kabs8.v4i8(<4 x i8>) nounwind + +declare i32 @llvm.riscv.kabs16.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.kabs16.v2i16(<2 x i16>) nounwind + +declare i32 @llvm.riscv.kabsw.i32(i32) nounwind + +declare i32 @llvm.riscv.kadd8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.kadd8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.kadd16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.kadd16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kaddh.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.kaddw.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.kcras16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.kcras16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kcrsa16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.kcrsa16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kdmbb.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.kdmbb.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kdmbt.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.kdmbt.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kdmtt.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.kdmtt.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kdmabb.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kdmabb.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kdmabt.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kdmabt.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kdmatt.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kdmatt.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.khm8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.khm8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.khmx8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.khmx8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.khm16.i32(i32, i32) nounwind + +declare <2 x i16> 
@llvm.riscv.v.khm16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.khmx16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.khmx16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.khmbb.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.khmbb.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.khmbt.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.khmbt.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.khmtt.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.khmtt.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmabb.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmabb.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmabt.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmabt.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmatt.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmatt.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmada.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmada.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmaxda.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmaxda.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmads.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmads.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmadrs.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmadrs.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmaxds.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmaxds.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmda.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmda.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmxda.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmxda.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmac.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.kmmac.u.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.kmmawb.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmmawb.i32.v2i16(i32, i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmawb.u.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmmawb.u.i32.v2i16(i32, i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmawb2.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmmawb2.i32.v2i16(i32, i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmawb2.u.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmmawb2.u.i32.v2i16(i32, i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmawt.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmmawt.i32.v2i16(i32, i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmawt.u.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmmawt.u.i32.v2i16(i32, i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmawt2.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmmawt2.i32.v2i16(i32, i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmawt2.u.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmmawt2.u.i32.v2i16(i32, i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmsb.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.kmmsb.u.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.kmmwb2.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmmwb2.i32.v2i16(i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmwb2.u.i32(i32, i32) 
nounwind + +declare i32 @llvm.riscv.v.kmmwb2.u.i32.v2i16(i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmwt2.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmmwt2.i32.v2i16(i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmmwt2.u.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmmwt2.u.i32.v2i16(i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmsda.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmsda.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kmsxda.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.kmsxda.i32.v2i16(i32, <2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ksllw.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.ksll8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.ksll8.v4i8.i32(<4 x i8>, i32) nounwind + +declare i32 @llvm.riscv.ksll16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.ksll16.v2i16.i32(<2 x i16>, i32) nounwind + +declare i32 @llvm.riscv.kslra8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.kslra8.v4i8.i32(<4 x i8>, i32) nounwind + +declare i32 @llvm.riscv.kslra8.u.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.kslra8.u.v4i8.i32(<4 x i8>, i32) nounwind + +declare i32 @llvm.riscv.kslra16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.kslra16.v2i16.i32(<2 x i16>, i32) nounwind + +declare i32 @llvm.riscv.kslra16.u.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.kslra16.u.v2i16.i32(<2 x i16>, i32) nounwind + +declare i32 @llvm.riscv.kstas16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.kstas16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.kstsa16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.kstsa16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ksub8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.ksub8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.ksub16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.ksub16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ksubh.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.ksubw.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.kwmmul.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.kwmmul.u.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.maxw.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.minw.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.pbsad.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.pbsad.i32.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.pbsada.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.pbsada.i32.v4i8(i32, <4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.pkbb16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.pkbb16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.pkbt16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.pkbt16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.pktt16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.pktt16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.pktb16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.pktb16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.radd8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.radd8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.radd16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.radd16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.raddw.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.rcras16.i32(i32, i32) 
nounwind + +declare <2 x i16> @llvm.riscv.v.rcras16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.rcrsa16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.rcrsa16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.rstas16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.rstas16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.rstsa16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.rstsa16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.rsub8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.rsub8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.rsub16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.rsub16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.rsubw.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.sclip8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.sclip8.v4i8.i32(<4 x i8>, i32) nounwind + +declare i32 @llvm.riscv.sclip16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.sclip16.v2i16.i32(<2 x i16>, i32) nounwind + +declare i32 @llvm.riscv.sclip32.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.scmple8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.scmple8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.scmple16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.scmple16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.scmplt8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.scmplt8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.scmplt16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.scmplt16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.sll8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.sll8.v4i8.i32(<4 x i8>, i32) nounwind + +declare i32 @llvm.riscv.sll16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.sll16.v2i16.i32(<2 x i16>, i32) nounwind + +declare i32 @llvm.riscv.smaqa.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.smaqa.i32.v4i8(i32, <4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.smaqa.su.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.smaqa.su.i32.v4i8(i32, <4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.smax8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.smax8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.smax16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.smax16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.smbb16.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.smbb16.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.smbt16.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.smbt16.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.smtt16.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.smtt16.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.smds.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.smds.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.smdrs.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.smdrs.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.smxds.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.smxds.i32.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.smin8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.smin8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.smin16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.smin16.v2i16(<2 x i16>, <2 x 
i16>) nounwind + +declare i32 @llvm.riscv.smmul.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.smmul.u.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.smmwb.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.smmwb.i32.v2i16(i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.smmwb.u.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.smmwb.u.i32.v2i16(i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.smmwt.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.smmwt.i32.v2i16(i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.smmwt.u.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.v.smmwt.u.i32.v2i16(i32, <2 x i16>) nounwind + +declare i32 @llvm.riscv.sra.u.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.sra8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.sra8.v4i8.i32(<4 x i8>, i32) nounwind + +declare i32 @llvm.riscv.sra8.u.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.sra8.u.v4i8.i32(<4 x i8>, i32) nounwind + +declare i32 @llvm.riscv.sra16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.sra16.v2i16.i32(<2 x i16>, i32) nounwind + +declare i32 @llvm.riscv.sra16.u.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.sra16.u.v2i16.i32(<2 x i16>, i32) nounwind + +declare i32 @llvm.riscv.srl8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.srl8.v4i8.i32(<4 x i8>, i32) nounwind + +declare i32 @llvm.riscv.srl8.u.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.srl8.u.v4i8.i32(<4 x i8>, i32) nounwind + +declare i32 @llvm.riscv.srl16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.srl16.v2i16.i32(<2 x i16>, i32) nounwind + +declare i32 @llvm.riscv.srl16.u.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.srl16.u.v2i16.i32(<2 x i16>, i32) nounwind + +declare i32 @llvm.riscv.stas16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.stas16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.stsa16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.stsa16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.sub8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.sub8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.sub16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.sub16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.sunpkd810.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.sunpkd810.v2i16(<4 x i8>) nounwind + +declare i32 @llvm.riscv.sunpkd820.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.sunpkd820.v2i16(<4 x i8>) nounwind + +declare i32 @llvm.riscv.sunpkd830.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.sunpkd830.v2i16(<4 x i8>) nounwind + +declare i32 @llvm.riscv.sunpkd831.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.sunpkd831.v2i16(<4 x i8>) nounwind + +declare i32 @llvm.riscv.sunpkd832.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.sunpkd832.v2i16(<4 x i8>) nounwind + +declare i32 @llvm.riscv.swap8.i32(i32) nounwind + +declare <4 x i8> @llvm.riscv.v.swap8.v4i8(<4 x i8>) nounwind + +declare i32 @llvm.riscv.swap16.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.swap16.v2i16(<2 x i16>) nounwind + +declare i32 @llvm.riscv.uclip8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.uclip8.v4i8.i32(<4 x i8>, i32) nounwind + +declare i32 @llvm.riscv.uclip16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.uclip16.v2i16.i32(<2 x i16>, i32) nounwind + +declare i32 @llvm.riscv.uclip32.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.ucmple8.i32(i32, i32) nounwind + +declare <4 x i8> 
@llvm.riscv.v.ucmple8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.ucmple16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.ucmple16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ucmplt8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.ucmplt8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.ucmplt16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.ucmplt16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ukadd8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.ukadd8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.ukadd16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.ukadd16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ukaddh.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.ukaddw.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.ukcras16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.ukcras16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ukcrsa16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.ukcrsa16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ukstas16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.ukstas16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ukstsa16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.ukstsa16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.uksub8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.uksub8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.uksub16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.uksub16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.uksubh.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.uksubw.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.umaqa.i32(i32, i32, i32) nounwind + +declare i32 @llvm.riscv.v.umaqa.i32.v4i8(i32, <4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.umax8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.umax8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.umax16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.umax16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.umin8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.umin8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.umin16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.umin16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.uradd8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.uradd8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.uradd16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.uradd16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.uraddw.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.urcras16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.urcras16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.urcrsa16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.urcrsa16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.urstas16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.urstas16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.urstsa16.i32(i32, i32) nounwind + +declare <2 x i16> @llvm.riscv.v.urstsa16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ursub8.i32(i32, i32) nounwind + +declare <4 x i8> @llvm.riscv.v.ursub8.v4i8(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.riscv.ursub16.i32(i32, i32) nounwind + 
+declare <2 x i16> @llvm.riscv.v.ursub16.v2i16(<2 x i16>, <2 x i16>) nounwind + +declare i32 @llvm.riscv.ursubw.i32(i32, i32) nounwind + +declare i32 @llvm.riscv.zunpkd810.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.zunpkd810.v2i16(<4 x i8>) nounwind + +declare i32 @llvm.riscv.zunpkd820.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.zunpkd820.v2i16(<4 x i8>) nounwind + +declare i32 @llvm.riscv.zunpkd830.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.zunpkd830.v2i16(<4 x i8>) nounwind + +declare i32 @llvm.riscv.zunpkd831.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.zunpkd831.v2i16(<4 x i8>) nounwind + +declare i32 @llvm.riscv.zunpkd832.i32(i32) nounwind + +declare <2 x i16> @llvm.riscv.v.zunpkd832.v2i16(<4 x i8>) nounwind diff --git a/llvm/test/CodeGen/RISCV/intrinsics-rv64p.ll b/llvm/test/CodeGen/RISCV/intrinsics-rv64p.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/intrinsics-rv64p.ll @@ -0,0 +1,3879 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV64P %s + +define void @test() nounwind { +; RV64P-LABEL: test: +; RV64P: # %bb.0: # %entry +; RV64P-NEXT: addi sp, sp, -352 +; RV64P-NEXT: sw zero, 348(sp) +; RV64P-NEXT: addi a0, zero, 1 +; RV64P-NEXT: sw a0, 344(sp) +; RV64P-NEXT: sw zero, 340(sp) +; RV64P-NEXT: sw a0, 336(sp) +; RV64P-NEXT: sd zero, 328(sp) +; RV64P-NEXT: sd zero, 320(sp) +; RV64P-NEXT: sd a0, 312(sp) +; RV64P-NEXT: sd zero, 296(sp) +; RV64P-NEXT: sd a0, 288(sp) +; RV64P-NEXT: addi a1, zero, 2 +; RV64P-NEXT: sd a1, 280(sp) +; RV64P-NEXT: addi a2, zero, 3 +; RV64P-NEXT: sd a2, 272(sp) +; RV64P-NEXT: sd zero, 256(sp) +; RV64P-NEXT: sd a0, 248(sp) +; RV64P-NEXT: sd a1, 240(sp) +; RV64P-NEXT: sd zero, 224(sp) +; RV64P-NEXT: sd a0, 216(sp) +; RV64P-NEXT: sd a1, 208(sp) +; RV64P-NEXT: sb a2, 199(sp) +; RV64P-NEXT: sb a1, 198(sp) +; RV64P-NEXT: sb a0, 197(sp) +; RV64P-NEXT: sb zero, 196(sp) +; RV64P-NEXT: sb a2, 195(sp) +; RV64P-NEXT: sb a1, 194(sp) +; RV64P-NEXT: sb a0, 193(sp) +; RV64P-NEXT: sb zero, 192(sp) +; RV64P-NEXT: sb a2, 191(sp) +; RV64P-NEXT: sb a1, 190(sp) +; RV64P-NEXT: sb a0, 189(sp) +; RV64P-NEXT: sb zero, 188(sp) +; RV64P-NEXT: sb a2, 187(sp) +; RV64P-NEXT: lui a2, %hi(.LCPI0_0) +; RV64P-NEXT: ld a2, %lo(.LCPI0_0)(a2) +; RV64P-NEXT: sb a1, 186(sp) +; RV64P-NEXT: sb a0, 185(sp) +; RV64P-NEXT: sb zero, 184(sp) +; RV64P-NEXT: sd a2, 176(sp) +; RV64P-NEXT: sd a2, 168(sp) +; RV64P-NEXT: sd a2, 152(sp) +; RV64P-NEXT: sd a2, 144(sp) +; RV64P-NEXT: sh a0, 134(sp) +; RV64P-NEXT: sh zero, 132(sp) +; RV64P-NEXT: sh a0, 130(sp) +; RV64P-NEXT: sh zero, 128(sp) +; RV64P-NEXT: sh a0, 126(sp) +; RV64P-NEXT: lui a1, %hi(.LCPI0_1) +; RV64P-NEXT: ld a1, %lo(.LCPI0_1)(a1) +; RV64P-NEXT: sh zero, 124(sp) +; RV64P-NEXT: sh a0, 122(sp) +; RV64P-NEXT: sh zero, 120(sp) +; RV64P-NEXT: sd a1, 112(sp) +; RV64P-NEXT: lui a0, %hi(.LCPI0_2) +; RV64P-NEXT: ld a0, %lo(.LCPI0_2)(a0) +; RV64P-NEXT: sd a1, 104(sp) +; RV64P-NEXT: sd a1, 88(sp) +; RV64P-NEXT: sd a1, 80(sp) +; RV64P-NEXT: sd a0, 64(sp) +; RV64P-NEXT: sd a0, 56(sp) +; RV64P-NEXT: sd a0, 48(sp) +; RV64P-NEXT: sd a0, 32(sp) +; RV64P-NEXT: sd a0, 24(sp) +; RV64P-NEXT: sd a0, 16(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: add8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: ld a1, 144(sp) +; RV64P-NEXT: add8 a0, a0, a1 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 
176(sp) +; RV64P-NEXT: ld a1, 168(sp) +; RV64P-NEXT: add8 a0, a0, a1 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: add16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: add16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: add16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 320(sp) +; RV64P-NEXT: ld a1, 312(sp) +; RV64P-NEXT: ave a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: bitrev a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ld a2, 272(sp) +; RV64P-NEXT: bpick a0, a0, a1, a2 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: clrs8 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: clrs8 a0, a0 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: clrs16 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: clrs16 a0, a0 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: clrs32 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 56(sp) +; RV64P-NEXT: clrs32 a0, a0 +; RV64P-NEXT: sd a0, 8(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: clo8 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: clo8 a0, a0 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: clo16 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: clo16 a0, a0 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: clo32 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 56(sp) +; RV64P-NEXT: clo32 a0, a0 +; RV64P-NEXT: sd a0, 8(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: clz8 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: clz8 a0, a0 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: clz16 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: clz16 a0, a0 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: clz32 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 56(sp) +; RV64P-NEXT: clz32 a0, a0 +; RV64P-NEXT: sd a0, 8(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: cmpeq8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: ld a1, 168(sp) +; RV64P-NEXT: cmpeq8 a0, a0, a1 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: ld a1, 144(sp) +; RV64P-NEXT: cmpeq8 a0, a0, a1 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: cmpeq16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: cmpeq16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: cmpeq16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: cras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: cras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: cras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld 
a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: crsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: crsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: crsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: addi a2, zero, 5 +; RV64P-NEXT: insb a0, a1, a2 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: kabs8 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: kabs8 a0, a0 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: kabs16 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: kabs16 a0, a0 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 320(sp) +; RV64P-NEXT: kabsw a0, a0 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kadd8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: ld a1, 168(sp) +; RV64P-NEXT: kadd8 a0, a0, a1 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kadd16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kadd16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: lw a0, 348(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: kaddh a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: lw a0, 348(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: kaddw a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kcras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kcras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: kdmbb a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kdmbb a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: kdmbt a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kdmbt a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: kdmtt a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kdmtt a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: lwu a1, 340(sp) +; RV64P-NEXT: lwu a2, 336(sp) +; RV64P-NEXT: kdmabb a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kdmabb a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: lwu a1, 340(sp) +; RV64P-NEXT: lwu a2, 336(sp) +; RV64P-NEXT: kdmabt a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kdmabt a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 328(sp) +; 
RV64P-NEXT: lwu a1, 340(sp) +; RV64P-NEXT: lwu a2, 336(sp) +; RV64P-NEXT: kdmatt a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kdmatt a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: khm8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: ld a1, 168(sp) +; RV64P-NEXT: khm8 a0, a0, a1 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: khmx8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: ld a1, 168(sp) +; RV64P-NEXT: khmx8 a0, a0, a1 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: khm16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: khm16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: khmx16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: khmx16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: khmbb a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: khmbb a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: khmbt a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: khmbt a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: khmtt a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: khmtt a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmabb a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmabb a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmabt a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmabt a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmatt a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmatt a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmada a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmada a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmaxda a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmaxda a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 
288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmads a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmads a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmadrs a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmadrs a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmaxds a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmaxds a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kmda a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kmda a0, a0, a1 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kmxda a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kmxda a0, a0, a1 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 320(sp) +; RV64P-NEXT: ld a2, 312(sp) +; RV64P-NEXT: kmmac a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 48(sp) +; RV64P-NEXT: kmmac a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 320(sp) +; RV64P-NEXT: ld a2, 312(sp) +; RV64P-NEXT: kmmac.u a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 48(sp) +; RV64P-NEXT: kmmac.u a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmmawb a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmmawb a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmmawb.u a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmmawb.u a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmmawb2 a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmmawb2 a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmmawb2.u a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmmawb2.u a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmmawt a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmmawt a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 
280(sp) +; RV64P-NEXT: kmmawt.u a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmmawt.u a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmmawt2 a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmmawt2 a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmmawt2.u a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmmawt2.u a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 320(sp) +; RV64P-NEXT: ld a2, 312(sp) +; RV64P-NEXT: kmmsb a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 48(sp) +; RV64P-NEXT: kmmsb a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 320(sp) +; RV64P-NEXT: ld a2, 312(sp) +; RV64P-NEXT: kmmsb.u a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 56(sp) +; RV64P-NEXT: ld a2, 48(sp) +; RV64P-NEXT: kmmsb.u a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 320(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kmmwb2 a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 56(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kmmwb2 a0, a0, a1 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 320(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kmmwb2.u a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 56(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kmmwb2.u a0, a0, a1 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 320(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kmmwt2 a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 56(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kmmwt2 a0, a0, a1 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 320(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kmmwt2.u a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 56(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kmmwt2.u a0, a0, a1 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmsda a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmsda a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 328(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: kmsxda a0, a1, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 64(sp) +; RV64P-NEXT: ld a1, 112(sp) +; RV64P-NEXT: ld a2, 104(sp) +; RV64P-NEXT: kmsxda a0, a1, a2 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 320(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: ksllw a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: ksll8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: ksll8 a0, a0, a1 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: ksll16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) 
+; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: ksll16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: kslra8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: kslra8 a0, a0, a1 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: kslra8.u a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: kslra8.u a0, a0, a1 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: kslra16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: kslra16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: kslra16.u a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: kslra16.u a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kstas16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kstas16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: kstsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: kstsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ksub8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: ld a1, 168(sp) +; RV64P-NEXT: ksub8 a0, a0, a1 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ksub16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: ksub16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: lw a0, 348(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: ksubh a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: lw a0, 348(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: ksubw a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 320(sp) +; RV64P-NEXT: ld a1, 312(sp) +; RV64P-NEXT: kwmmul a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 56(sp) +; RV64P-NEXT: ld a1, 48(sp) +; RV64P-NEXT: kwmmul a0, a0, a1 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: ld a0, 320(sp) +; RV64P-NEXT: ld a1, 312(sp) +; RV64P-NEXT: kwmmul.u a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 56(sp) +; RV64P-NEXT: ld a1, 48(sp) +; RV64P-NEXT: kwmmul.u a0, a0, a1 +; RV64P-NEXT: sd a0, 40(sp) +; RV64P-NEXT: lw a0, 348(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: maxw a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: lw a0, 348(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: minw a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: pbsad a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: ld a1, 144(sp) +; RV64P-NEXT: pbsad a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 296(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: pbsada a0, a1, a2 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 296(sp) +; RV64P-NEXT: ld a1, 152(sp) +; RV64P-NEXT: ld a2, 144(sp) +; RV64P-NEXT: pbsada 
a0, a1, a2 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: pkbb16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: pkbb16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: pkbt16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: pkbt16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: pktt16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: pktt16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: pktb16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: pktb16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: radd8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: ld a1, 168(sp) +; RV64P-NEXT: radd8 a0, a0, a1 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: radd16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: radd16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: lw a0, 348(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: raddw a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: rcras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: rcras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: rcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: rcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: rcras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: rcras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: rcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: rcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: rsub8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: ld a1, 168(sp) +; RV64P-NEXT: rsub8 a0, a0, a1 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: rsub16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: rsub16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: lw a0, 348(sp) +; RV64P-NEXT: lw a1, 344(sp) +; RV64P-NEXT: rsubw a0, a0, a1 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: addi a0, zero, 7 +; RV64P-NEXT: sclip8 a1, a1, a0 +; RV64P-NEXT: sd a1, 264(sp) +; RV64P-NEXT: ld a1, 176(sp) +; RV64P-NEXT: sclip8 a1, a1, a0 +; RV64P-NEXT: sd a1, 160(sp) +; RV64P-NEXT: ld a2, 288(sp) +; RV64P-NEXT: addi a1, zero, 8 +; RV64P-NEXT: sclip16 a2, a2, a1 +; RV64P-NEXT: sd 
a2, 264(sp) +; RV64P-NEXT: ld a2, 112(sp) +; RV64P-NEXT: sclip16 a2, a2, a1 +; RV64P-NEXT: sd a2, 96(sp) +; RV64P-NEXT: ld a3, 320(sp) +; RV64P-NEXT: addi a2, zero, 9 +; RV64P-NEXT: sclip32 a3, a3, a2 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 56(sp) +; RV64P-NEXT: sclip32 a3, a3, a2 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: scmple8 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: ld a4, 168(sp) +; RV64P-NEXT: scmple8 a3, a3, a4 +; RV64P-NEXT: sd a3, 136(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: scmple16 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: scmple16 a3, a3, a4 +; RV64P-NEXT: sd a3, 72(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: scmplt8 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: ld a4, 168(sp) +; RV64P-NEXT: scmplt8 a3, a3, a4 +; RV64P-NEXT: sd a3, 136(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: scmplt16 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: scmplt16 a3, a3, a4 +; RV64P-NEXT: sd a3, 72(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sll8 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 152(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sll8 a3, a3, a4 +; RV64P-NEXT: sd a3, 136(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sll16 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 88(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sll16 a3, a3, a4 +; RV64P-NEXT: sd a3, 72(sp) +; RV64P-NEXT: ld a3, 328(sp) +; RV64P-NEXT: ld a4, 288(sp) +; RV64P-NEXT: ld a5, 280(sp) +; RV64P-NEXT: smaqa a3, a4, a5 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 64(sp) +; RV64P-NEXT: ld a4, 176(sp) +; RV64P-NEXT: ld a5, 168(sp) +; RV64P-NEXT: smaqa a3, a4, a5 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 328(sp) +; RV64P-NEXT: ld a4, 288(sp) +; RV64P-NEXT: ld a5, 280(sp) +; RV64P-NEXT: smaqa.su a3, a4, a5 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 64(sp) +; RV64P-NEXT: ld a4, 176(sp) +; RV64P-NEXT: ld a5, 168(sp) +; RV64P-NEXT: smaqa.su a3, a4, a5 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smax8 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: ld a4, 168(sp) +; RV64P-NEXT: smax8 a3, a3, a4 +; RV64P-NEXT: sd a3, 160(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smax16 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smax16 a3, a3, a4 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smbb16 a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smbb16 a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smbt16 a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smbt16 a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smtt16 a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 112(sp) +; 
RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smtt16 a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smds a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smds a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smdrs a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smdrs a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smxds a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smxds a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smin8 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: ld a4, 168(sp) +; RV64P-NEXT: smin8 a3, a3, a4 +; RV64P-NEXT: sd a3, 160(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smin16 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smin16 a3, a3, a4 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 320(sp) +; RV64P-NEXT: ld a4, 312(sp) +; RV64P-NEXT: smmul a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 56(sp) +; RV64P-NEXT: ld a4, 48(sp) +; RV64P-NEXT: smmul a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 320(sp) +; RV64P-NEXT: ld a4, 312(sp) +; RV64P-NEXT: smmul.u a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 56(sp) +; RV64P-NEXT: ld a4, 48(sp) +; RV64P-NEXT: smmul.u a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 320(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smmwb a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 56(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smmwb a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 320(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smmwb.u a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 56(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smmwb.u a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 320(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smmwt a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 56(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smmwt a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 320(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: smmwt.u a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 56(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: smmwt.u a3, a3, a4 +; RV64P-NEXT: sd a3, 40(sp) +; RV64P-NEXT: ld a3, 320(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sra.u a3, a3, a4 +; RV64P-NEXT: sd a3, 304(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sra8 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sra8 a3, a3, a4 +; RV64P-NEXT: sd a3, 160(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sra8.u a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sra8.u a3, a3, a4 +; RV64P-NEXT: sd a3, 160(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sra16 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: lwu 
a4, 336(sp) +; RV64P-NEXT: sra16 a3, a3, a4 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sra16.u a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: sra16.u a3, a3, a4 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: srl8 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 152(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: srl8 a3, a3, a4 +; RV64P-NEXT: sd a3, 136(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: srl8.u a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 152(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: srl8.u a3, a3, a4 +; RV64P-NEXT: sd a3, 136(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: srl16 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 88(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: srl16 a3, a3, a4 +; RV64P-NEXT: sd a3, 72(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: srl16.u a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 88(sp) +; RV64P-NEXT: lwu a4, 336(sp) +; RV64P-NEXT: srl16.u a3, a3, a4 +; RV64P-NEXT: sd a3, 72(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: stas16 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 88(sp) +; RV64P-NEXT: ld a4, 80(sp) +; RV64P-NEXT: stas16 a3, a3, a4 +; RV64P-NEXT: sd a3, 72(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: stas16 a3, a3, a4 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: stsa16 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 88(sp) +; RV64P-NEXT: ld a4, 80(sp) +; RV64P-NEXT: stsa16 a3, a3, a4 +; RV64P-NEXT: sd a3, 72(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: stsa16 a3, a3, a4 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: sub8 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 152(sp) +; RV64P-NEXT: ld a4, 144(sp) +; RV64P-NEXT: sub8 a3, a3, a4 +; RV64P-NEXT: sd a3, 136(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: ld a4, 168(sp) +; RV64P-NEXT: sub8 a3, a3, a4 +; RV64P-NEXT: sd a3, 160(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: ld a4, 280(sp) +; RV64P-NEXT: sub16 a3, a3, a4 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 88(sp) +; RV64P-NEXT: ld a4, 80(sp) +; RV64P-NEXT: sub16 a3, a3, a4 +; RV64P-NEXT: sd a3, 72(sp) +; RV64P-NEXT: ld a3, 112(sp) +; RV64P-NEXT: ld a4, 104(sp) +; RV64P-NEXT: sub16 a3, a3, a4 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: sunpkd810 a3, a3 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: sunpkd810 a3, a3 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: sunpkd820 a3, a3 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: sunpkd820 a3, a3 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: sunpkd830 a3, a3 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: sunpkd830 a3, a3 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: sunpkd831 a3, a3 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: sunpkd831 a3, a3 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: sunpkd832 a3, a3 
+; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 176(sp) +; RV64P-NEXT: sunpkd832 a3, a3 +; RV64P-NEXT: sd a3, 96(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: swap8 a3, a3 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 152(sp) +; RV64P-NEXT: swap8 a3, a3 +; RV64P-NEXT: sd a3, 136(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: swap16 a3, a3 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 88(sp) +; RV64P-NEXT: swap16 a3, a3 +; RV64P-NEXT: sd a3, 72(sp) +; RV64P-NEXT: ld a3, 288(sp) +; RV64P-NEXT: uclip8 a3, a3, a0 +; RV64P-NEXT: sd a3, 264(sp) +; RV64P-NEXT: ld a3, 152(sp) +; RV64P-NEXT: uclip8 a0, a3, a0 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: uclip16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: uclip16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 320(sp) +; RV64P-NEXT: uclip32 a0, a0, a2 +; RV64P-NEXT: sd a0, 304(sp) +; RV64P-NEXT: ld a0, 24(sp) +; RV64P-NEXT: uclip32 a0, a0, a2 +; RV64P-NEXT: sd a0, 8(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ucmple8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: ld a1, 144(sp) +; RV64P-NEXT: ucmple8 a0, a0, a1 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ucmple16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: ucmple16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ucmplt8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: ld a1, 144(sp) +; RV64P-NEXT: ucmplt8 a0, a0, a1 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ucmplt16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: ucmplt16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ukadd8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 176(sp) +; RV64P-NEXT: ld a1, 168(sp) +; RV64P-NEXT: ukadd8 a0, a0, a1 +; RV64P-NEXT: sd a0, 160(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ukadd16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 112(sp) +; RV64P-NEXT: ld a1, 104(sp) +; RV64P-NEXT: ukadd16 a0, a0, a1 +; RV64P-NEXT: sd a0, 96(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: ukaddh a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: ukaddw a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ukcras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: ukcras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ukcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: ukcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ukstas16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: ukstas16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld 
a1, 280(sp) +; RV64P-NEXT: ukstsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: ukstsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: uksub8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: ld a1, 144(sp) +; RV64P-NEXT: uksub8 a0, a0, a1 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: uksub16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: uksub16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: uksubh a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: uksubw a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 296(sp) +; RV64P-NEXT: ld a1, 288(sp) +; RV64P-NEXT: ld a2, 280(sp) +; RV64P-NEXT: umaqa a0, a1, a2 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 32(sp) +; RV64P-NEXT: ld a1, 152(sp) +; RV64P-NEXT: ld a2, 144(sp) +; RV64P-NEXT: umaqa a0, a1, a2 +; RV64P-NEXT: sd a0, 8(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: umax8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: ld a1, 144(sp) +; RV64P-NEXT: umax8 a0, a0, a1 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: umax16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: umax16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: umin8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: ld a1, 144(sp) +; RV64P-NEXT: umin8 a0, a0, a1 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: umin16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: umin16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: uradd8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: ld a1, 144(sp) +; RV64P-NEXT: uradd8 a0, a0, a1 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: uradd16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: uradd16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: uraddw a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: urcras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: urcras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: urcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: urcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: urcras16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: urcras16 a0, a0, a1 +; 
RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: urcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: urcrsa16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ursub8 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: ld a1, 144(sp) +; RV64P-NEXT: ursub8 a0, a0, a1 +; RV64P-NEXT: sd a0, 136(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: ld a1, 280(sp) +; RV64P-NEXT: ursub16 a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 88(sp) +; RV64P-NEXT: ld a1, 80(sp) +; RV64P-NEXT: ursub16 a0, a0, a1 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: lwu a0, 340(sp) +; RV64P-NEXT: lwu a1, 336(sp) +; RV64P-NEXT: ursubw a0, a0, a1 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: zunpkd810 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: zunpkd810 a0, a0 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: zunpkd820 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: zunpkd820 a0, a0 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: zunpkd830 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: zunpkd830 a0, a0 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: zunpkd831 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: zunpkd831 a0, a0 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: ld a0, 288(sp) +; RV64P-NEXT: zunpkd832 a0, a0 +; RV64P-NEXT: sd a0, 264(sp) +; RV64P-NEXT: ld a0, 152(sp) +; RV64P-NEXT: zunpkd832 a0, a0 +; RV64P-NEXT: sd a0, 72(sp) +; RV64P-NEXT: addi sp, sp, 352 +; RV64P-NEXT: ret +entry: + %i_a = alloca i32, align 4 + %i_b = alloca i32, align 4 + %ui_a = alloca i32, align 4 + %ui_b = alloca i32, align 4 + %l_t = alloca i64, align 8 + %l_a = alloca i64, align 8 + %l_b = alloca i64, align 8 + %l_r = alloca i64, align 8 + %ul_t = alloca i64, align 8 + %ul_a = alloca i64, align 8 + %ul_b = alloca i64, align 8 + %ul_c = alloca i64, align 8 + %ul_r = alloca i64, align 8 + %ll_t = alloca i64, align 8 + %ll_a = alloca i64, align 8 + %ll_b = alloca i64, align 8 + %ll_r = alloca i64, align 8 + %ull_t = alloca i64, align 8 + %ull_a = alloca i64, align 8 + %ull_b = alloca i64, align 8 + %ull_r = alloca i64, align 8 + %i8x4_a = alloca <4 x i8>, align 4 + %i8x4_b = alloca <4 x i8>, align 4 + %u8x4_a = alloca <4 x i8>, align 4 + %u8x4_b = alloca <4 x i8>, align 4 + %i8x8_a = alloca <8 x i8>, align 8 + %i8x8_b = alloca <8 x i8>, align 8 + %i8x8_r = alloca <8 x i8>, align 8 + %u8x8_a = alloca <8 x i8>, align 8 + %u8x8_b = alloca <8 x i8>, align 8 + %u8x8_r = alloca <8 x i8>, align 8 + %i16x2_a = alloca <2 x i16>, align 4 + %i16x2_b = alloca <2 x i16>, align 4 + %u16x2_a = alloca <2 x i16>, align 4 + %u16x2_b = alloca <2 x i16>, align 4 + %i16x4_a = alloca <4 x i16>, align 8 + %i16x4_b = alloca <4 x i16>, align 8 + %i16x4_r = alloca <4 x i16>, align 8 + %u16x4_a = alloca <4 x i16>, align 8 + %u16x4_b = alloca <4 x i16>, align 8 + %u16x4_r = alloca <4 x i16>, align 8 + %i32x2_t = alloca <2 x i32>, align 8 + %i32x2_a = alloca <2 x i32>, align 8 + %i32x2_b = alloca <2 x i32>, align 8 + %i32x2_r = alloca <2 x i32>, align 8 + %u32x2_t = alloca <2 x i32>, align 8 + %u32x2_a = alloca <2 x i32>, align 8 + %u32x2_b = alloca <2 x i32>, align 8 + %u32x2_r = alloca <2 x i32>, 
align 8 + store volatile i32 0, i32* %i_a, align 4 + store volatile i32 1, i32* %i_b, align 4 + store volatile i32 0, i32* %ui_a, align 4 + store volatile i32 1, i32* %ui_b, align 4 + store volatile i64 0, i64* %l_t, align 8 + store volatile i64 0, i64* %l_a, align 8 + store volatile i64 1, i64* %l_b, align 8 + store volatile i64 0, i64* %ul_t, align 8 + store volatile i64 1, i64* %ul_a, align 8 + store volatile i64 2, i64* %ul_b, align 8 + store volatile i64 3, i64* %ul_c, align 8 + store volatile i64 0, i64* %ll_t, align 8 + store volatile i64 1, i64* %ll_a, align 8 + store volatile i64 2, i64* %ll_b, align 8 + store volatile i64 0, i64* %ull_t, align 8 + store volatile i64 1, i64* %ull_a, align 8 + store volatile i64 2, i64* %ull_b, align 8 + store volatile <4 x i8> , <4 x i8>* %i8x4_a, align 4 + store volatile <4 x i8> , <4 x i8>* %i8x4_b, align 4 + store volatile <4 x i8> , <4 x i8>* %u8x4_a, align 4 + store volatile <4 x i8> , <4 x i8>* %u8x4_b, align 4 + store volatile <8 x i8> , <8 x i8>* %i8x8_a, align 8 + store volatile <8 x i8> , <8 x i8>* %i8x8_b, align 8 + store volatile <8 x i8> , <8 x i8>* %u8x8_a, align 8 + store volatile <8 x i8> , <8 x i8>* %u8x8_b, align 8 + store volatile <2 x i16> , <2 x i16>* %i16x2_a, align 4 + store volatile <2 x i16> , <2 x i16>* %i16x2_b, align 4 + store volatile <2 x i16> , <2 x i16>* %u16x2_a, align 4 + store volatile <2 x i16> , <2 x i16>* %u16x2_b, align 4 + store volatile <4 x i16> , <4 x i16>* %i16x4_a, align 8 + store volatile <4 x i16> , <4 x i16>* %i16x4_b, align 8 + store volatile <4 x i16> , <4 x i16>* %u16x4_a, align 8 + store volatile <4 x i16> , <4 x i16>* %u16x4_b, align 8 + store volatile <2 x i32> , <2 x i32>* %i32x2_t, align 8 + store volatile <2 x i32> , <2 x i32>* %i32x2_a, align 8 + store volatile <2 x i32> , <2 x i32>* %i32x2_b, align 8 + store volatile <2 x i32> , <2 x i32>* %u32x2_t, align 8 + store volatile <2 x i32> , <2 x i32>* %u32x2_a, align 8 + store volatile <2 x i32> , <2 x i32>* %u32x2_b, align 8 + %0 = load i64, i64* %ul_a, align 8 + %1 = load i64, i64* %ul_b, align 8 + %2 = call i64 @llvm.riscv.add8.i64(i64 %0, i64 %1) + store volatile i64 %2, i64* %ul_r, align 8 + %3 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %4 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %5 = call <8 x i8> @llvm.riscv.v.add8.v8i8(<8 x i8> %3, <8 x i8> %4) + store volatile <8 x i8> %5, <8 x i8>* %u8x8_r, align 8 + %6 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %7 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %8 = call <8 x i8> @llvm.riscv.v.add8.v8i8(<8 x i8> %6, <8 x i8> %7) + store volatile <8 x i8> %8, <8 x i8>* %i8x8_r, align 8 + %9 = load i64, i64* %ul_a, align 8 + %10 = load i64, i64* %ul_b, align 8 + %11 = call i64 @llvm.riscv.add16.i64(i64 %9, i64 %10) + store volatile i64 %11, i64* %ul_r, align 8 + %12 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %13 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %14 = call <4 x i16> @llvm.riscv.v.add16.v4i16(<4 x i16> %12, <4 x i16> %13) + store volatile <4 x i16> %14, <4 x i16>* %u16x4_r, align 8 + %15 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %16 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %17 = call <4 x i16> @llvm.riscv.v.add16.v4i16(<4 x i16> %15, <4 x i16> %16) + store volatile <4 x i16> %17, <4 x i16>* %i16x4_r, align 8 + %18 = load i64, i64* %l_a, align 8 + %19 = load i64, i64* %l_b, align 8 + %20 = call i64 @llvm.riscv.ave.i64(i64 %18, i64 %19) + store volatile i64 %20, i64* %l_r, align 8 + %21 = load i64, i64* %ul_a, align 8 + %22 = load i64, i64* %ul_b, align 8 + %23 
= call i64 @llvm.riscv.bitrev.i64(i64 %21, i64 %22) + store volatile i64 %23, i64* %ul_r, align 8 + %24 = load i64, i64* %ul_a, align 8 + %25 = load i64, i64* %ul_b, align 8 + %26 = load i64, i64* %ul_c, align 8 + %27 = call i64 @llvm.riscv.bpick.i64(i64 %24, i64 %25, i64 %26) + store volatile i64 %27, i64* %ul_r, align 8 + %28 = load i64, i64* %ul_a, align 8 + %29 = call i64 @llvm.riscv.clrs8.i64(i64 %28) + store volatile i64 %29, i64* %ul_r, align 8 + %30 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %31 = call <8 x i8> @llvm.riscv.v.clrs8.v8i8(<8 x i8> %30) + store volatile <8 x i8> %31, <8 x i8>* %u8x8_r, align 8 + %32 = load i64, i64* %ul_a, align 8 + %33 = call i64 @llvm.riscv.clrs16.i64(i64 %32) + store volatile i64 %33, i64* %ul_r, align 8 + %34 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %35 = call <4 x i16> @llvm.riscv.v.clrs16.v4i16(<4 x i16> %34) + store volatile <4 x i16> %35, <4 x i16>* %u16x4_r, align 8 + %36 = load i64, i64* %ul_a, align 8 + %37 = call i64 @llvm.riscv.clrs32.i64(i64 %36) + store volatile i64 %37, i64* %ul_r, align 8 + %38 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %39 = call <2 x i32> @llvm.riscv.v.clrs32.v2i32(<2 x i32> %38) + store volatile <2 x i32> %39, <2 x i32>* %u32x2_r, align 8 + %40 = load i64, i64* %ul_a, align 8 + %41 = call i64 @llvm.riscv.clo8.i64(i64 %40) + store volatile i64 %41, i64* %ul_r, align 8 + %42 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %43 = call <8 x i8> @llvm.riscv.v.clo8.v8i8(<8 x i8> %42) + store volatile <8 x i8> %43, <8 x i8>* %u8x8_r, align 8 + %44 = load i64, i64* %ul_a, align 8 + %45 = call i64 @llvm.riscv.clo16.i64(i64 %44) + store volatile i64 %45, i64* %ul_r, align 8 + %46 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %47 = call <4 x i16> @llvm.riscv.v.clo16.v4i16(<4 x i16> %46) + store volatile <4 x i16> %47, <4 x i16>* %u16x4_r, align 8 + %48 = load i64, i64* %ul_a, align 8 + %49 = call i64 @llvm.riscv.clo32.i64(i64 %48) + store volatile i64 %49, i64* %ul_r, align 8 + %50 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %51 = call <2 x i32> @llvm.riscv.v.clo32.v2i32(<2 x i32> %50) + store volatile <2 x i32> %51, <2 x i32>* %u32x2_r, align 8 + %52 = load i64, i64* %ul_a, align 8 + %53 = call i64 @llvm.riscv.clz8.i64(i64 %52) + store volatile i64 %53, i64* %ul_r, align 8 + %54 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %55 = call <8 x i8> @llvm.riscv.v.clz8.v8i8(<8 x i8> %54) + store volatile <8 x i8> %55, <8 x i8>* %u8x8_r, align 8 + %56 = load i64, i64* %ul_a, align 8 + %57 = call i64 @llvm.riscv.clz16.i64(i64 %56) + store volatile i64 %57, i64* %ul_r, align 8 + %58 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %59 = call <4 x i16> @llvm.riscv.v.clz16.v4i16(<4 x i16> %58) + store volatile <4 x i16> %59, <4 x i16>* %u16x4_r, align 8 + %60 = load i64, i64* %ul_a, align 8 + %61 = call i64 @llvm.riscv.clz32.i64(i64 %60) + store volatile i64 %61, i64* %ul_r, align 8 + %62 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %63 = call <2 x i32> @llvm.riscv.v.clz32.v2i32(<2 x i32> %62) + store volatile <2 x i32> %63, <2 x i32>* %u32x2_r, align 8 + %64 = load i64, i64* %ul_a, align 8 + %65 = load i64, i64* %ul_b, align 8 + %66 = call i64 @llvm.riscv.cmpeq8.i64(i64 %64, i64 %65) + store volatile i64 %66, i64* %ul_r, align 8 + %67 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %68 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %69 = call <8 x i8> @llvm.riscv.v.cmpeq8.v8i8(<8 x i8> %67, <8 x i8> %68) + store volatile <8 x i8> %69, <8 x i8>* %u8x8_r, align 8 + %70 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + 
%71 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %72 = call <8 x i8> @llvm.riscv.v.cmpeq8.v8i8(<8 x i8> %70, <8 x i8> %71) + store volatile <8 x i8> %72, <8 x i8>* %u8x8_r, align 8 + %73 = load i64, i64* %ul_a, align 8 + %74 = load i64, i64* %ul_b, align 8 + %75 = call i64 @llvm.riscv.cmpeq16.i64(i64 %73, i64 %74) + store volatile i64 %75, i64* %ul_r, align 8 + %76 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %77 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %78 = call <4 x i16> @llvm.riscv.v.cmpeq16.v4i16(<4 x i16> %76, <4 x i16> %77) + store volatile <4 x i16> %78, <4 x i16>* %u16x4_r, align 8 + %79 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %80 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %81 = call <4 x i16> @llvm.riscv.v.cmpeq16.v4i16(<4 x i16> %79, <4 x i16> %80) + store volatile <4 x i16> %81, <4 x i16>* %u16x4_r, align 8 + %82 = load i64, i64* %ul_a, align 8 + %83 = load i64, i64* %ul_b, align 8 + %84 = call i64 @llvm.riscv.cras16.i64(i64 %82, i64 %83) + store volatile i64 %84, i64* %ul_r, align 8 + %85 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %86 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %87 = call <4 x i16> @llvm.riscv.v.cras16.v4i16(<4 x i16> %85, <4 x i16> %86) + store volatile <4 x i16> %87, <4 x i16>* %u16x4_r, align 8 + %88 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %89 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %90 = call <4 x i16> @llvm.riscv.v.cras16.v4i16(<4 x i16> %88, <4 x i16> %89) + store volatile <4 x i16> %90, <4 x i16>* %i16x4_r, align 8 + %91 = load i64, i64* %ul_a, align 8 + %92 = load i64, i64* %ul_b, align 8 + %93 = call i64 @llvm.riscv.crsa16.i64(i64 %91, i64 %92) + store volatile i64 %93, i64* %ul_r, align 8 + %94 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %95 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %96 = call <4 x i16> @llvm.riscv.v.crsa16.v4i16(<4 x i16> %94, <4 x i16> %95) + store volatile <4 x i16> %96, <4 x i16>* %u16x4_r, align 8 + %97 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %98 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %99 = call <4 x i16> @llvm.riscv.v.crsa16.v4i16(<4 x i16> %97, <4 x i16> %98) + store volatile <4 x i16> %99, <4 x i16>* %i16x4_r, align 8 + %100 = load i64, i64* %ul_a, align 8 + %101 = load i64, i64* %ul_b, align 8 + %102 = call i64 @llvm.riscv.insb.i64(i64 %100, i64 %101, i64 5) + store volatile i64 %102, i64* %ul_r, align 8 + %103 = load i64, i64* %ul_a, align 8 + %104 = call i64 @llvm.riscv.kabs8.i64(i64 %103) + store volatile i64 %104, i64* %ul_r, align 8 + %105 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %106 = call <8 x i8> @llvm.riscv.v.kabs8.v8i8(<8 x i8> %105) + store volatile <8 x i8> %106, <8 x i8>* %i8x8_r, align 8 + %107 = load i64, i64* %ul_a, align 8 + %108 = call i64 @llvm.riscv.kabs16.i64(i64 %107) + store volatile i64 %108, i64* %ul_r, align 8 + %109 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %110 = call <4 x i16> @llvm.riscv.v.kabs16.v4i16(<4 x i16> %109) + store volatile <4 x i16> %110, <4 x i16>* %i16x4_r, align 8 + %111 = load i64, i64* %l_a, align 8 + %112 = call i64 @llvm.riscv.kabsw.i64(i64 %111) + store volatile i64 %112, i64* %l_r, align 8 + %113 = load i64, i64* %ul_a, align 8 + %114 = load i64, i64* %ul_b, align 8 + %115 = call i64 @llvm.riscv.kadd8.i64(i64 %113, i64 %114) + store volatile i64 %115, i64* %ul_r, align 8 + %116 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %117 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %118 = call <8 x i8> @llvm.riscv.v.kadd8.v8i8(<8 x i8> %116, <8 x i8> %117) + store volatile <8 x i8> 
%118, <8 x i8>* %i8x8_r, align 8 + %119 = load i64, i64* %ul_a, align 8 + %120 = load i64, i64* %ul_b, align 8 + %121 = call i64 @llvm.riscv.kadd16.i64(i64 %119, i64 %120) + store volatile i64 %121, i64* %ul_r, align 8 + %122 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %123 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %124 = call <4 x i16> @llvm.riscv.v.kadd16.v4i16(<4 x i16> %122, <4 x i16> %123) + store volatile <4 x i16> %124, <4 x i16>* %i16x4_r, align 8 + %125 = load i32, i32* %i_a, align 4 + %conv = sext i32 %125 to i64 + %126 = load i32, i32* %i_b, align 4 + %conv1 = sext i32 %126 to i64 + %127 = call i64 @llvm.riscv.kaddh.i64(i64 %conv, i64 %conv1) + store volatile i64 %127, i64* %l_r, align 8 + %128 = load i32, i32* %i_a, align 4 + %conv2 = sext i32 %128 to i64 + %129 = load i32, i32* %i_b, align 4 + %conv3 = sext i32 %129 to i64 + %130 = call i64 @llvm.riscv.kaddw.i64(i64 %conv2, i64 %conv3) + store volatile i64 %130, i64* %l_r, align 8 + %131 = load i64, i64* %ul_a, align 8 + %132 = load i64, i64* %ul_b, align 8 + %133 = call i64 @llvm.riscv.kcras16.i64(i64 %131, i64 %132) + store volatile i64 %133, i64* %ul_r, align 8 + %134 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %135 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %136 = call <4 x i16> @llvm.riscv.v.kcras16.v4i16(<4 x i16> %134, <4 x i16> %135) + store volatile <4 x i16> %136, <4 x i16>* %i16x4_r, align 8 + %137 = load i64, i64* %ul_a, align 8 + %138 = load i64, i64* %ul_b, align 8 + %139 = call i64 @llvm.riscv.kcrsa16.i64(i64 %137, i64 %138) + store volatile i64 %139, i64* %ul_r, align 8 + %140 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %141 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %142 = call <4 x i16> @llvm.riscv.v.kcrsa16.v4i16(<4 x i16> %140, <4 x i16> %141) + store volatile <4 x i16> %142, <4 x i16>* %i16x4_r, align 8 + %143 = load i32, i32* %ui_a, align 4 + %conv4 = zext i32 %143 to i64 + %144 = load i32, i32* %ui_b, align 4 + %conv5 = zext i32 %144 to i64 + %145 = call i64 @llvm.riscv.kdmbb.i64(i64 %conv4, i64 %conv5) + store volatile i64 %145, i64* %l_r, align 8 + %146 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %147 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %148 = call i64 @llvm.riscv.v.kdmbb.i64.v4i16(<4 x i16> %146, <4 x i16> %147) + store volatile i64 %148, i64* %l_r, align 8 + %149 = load i32, i32* %ui_a, align 4 + %conv6 = zext i32 %149 to i64 + %150 = load i32, i32* %ui_b, align 4 + %conv7 = zext i32 %150 to i64 + %151 = call i64 @llvm.riscv.kdmbt.i64(i64 %conv6, i64 %conv7) + store volatile i64 %151, i64* %l_r, align 8 + %152 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %153 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %154 = call i64 @llvm.riscv.v.kdmbt.i64.v4i16(<4 x i16> %152, <4 x i16> %153) + store volatile i64 %154, i64* %l_r, align 8 + %155 = load i32, i32* %ui_a, align 4 + %conv8 = zext i32 %155 to i64 + %156 = load i32, i32* %ui_b, align 4 + %conv9 = zext i32 %156 to i64 + %157 = call i64 @llvm.riscv.kdmtt.i64(i64 %conv8, i64 %conv9) + store volatile i64 %157, i64* %l_r, align 8 + %158 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %159 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %160 = call i64 @llvm.riscv.v.kdmtt.i64.v4i16(<4 x i16> %158, <4 x i16> %159) + store volatile i64 %160, i64* %l_r, align 8 + %161 = load i64, i64* %l_t, align 8 + %162 = load i32, i32* %ui_a, align 4 + %conv10 = zext i32 %162 to i64 + %163 = load i32, i32* %ui_b, align 4 + %conv11 = zext i32 %163 to i64 + %164 = call i64 @llvm.riscv.kdmabb.i64(i64 %161, i64 
%conv10, i64 %conv11) + store volatile i64 %164, i64* %l_r, align 8 + %165 = load i64, i64* %l_t, align 8 + %166 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %167 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %168 = call i64 @llvm.riscv.v.kdmabb.i64.v4i16(i64 %165, <4 x i16> %166, <4 x i16> %167) + store volatile i64 %168, i64* %l_r, align 8 + %169 = load i64, i64* %l_t, align 8 + %170 = load i32, i32* %ui_a, align 4 + %conv12 = zext i32 %170 to i64 + %171 = load i32, i32* %ui_b, align 4 + %conv13 = zext i32 %171 to i64 + %172 = call i64 @llvm.riscv.kdmabt.i64(i64 %169, i64 %conv12, i64 %conv13) + store volatile i64 %172, i64* %l_r, align 8 + %173 = load i64, i64* %l_t, align 8 + %174 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %175 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %176 = call i64 @llvm.riscv.v.kdmabt.i64.v4i16(i64 %173, <4 x i16> %174, <4 x i16> %175) + store volatile i64 %176, i64* %l_r, align 8 + %177 = load i64, i64* %l_t, align 8 + %178 = load i32, i32* %ui_a, align 4 + %conv14 = zext i32 %178 to i64 + %179 = load i32, i32* %ui_b, align 4 + %conv15 = zext i32 %179 to i64 + %180 = call i64 @llvm.riscv.kdmatt.i64(i64 %177, i64 %conv14, i64 %conv15) + store volatile i64 %180, i64* %l_r, align 8 + %181 = load i64, i64* %l_t, align 8 + %182 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %183 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %184 = call i64 @llvm.riscv.v.kdmatt.i64.v4i16(i64 %181, <4 x i16> %182, <4 x i16> %183) + store volatile i64 %184, i64* %l_r, align 8 + %185 = load i64, i64* %ul_a, align 8 + %186 = load i64, i64* %ul_b, align 8 + %187 = call i64 @llvm.riscv.khm8.i64(i64 %185, i64 %186) + store volatile i64 %187, i64* %ul_r, align 8 + %188 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %189 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %190 = call <8 x i8> @llvm.riscv.v.khm8.v8i8(<8 x i8> %188, <8 x i8> %189) + store volatile <8 x i8> %190, <8 x i8>* %i8x8_r, align 8 + %191 = load i64, i64* %ul_a, align 8 + %192 = load i64, i64* %ul_b, align 8 + %193 = call i64 @llvm.riscv.khmx8.i64(i64 %191, i64 %192) + store volatile i64 %193, i64* %ul_r, align 8 + %194 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %195 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %196 = call <8 x i8> @llvm.riscv.v.khmx8.v8i8(<8 x i8> %194, <8 x i8> %195) + store volatile <8 x i8> %196, <8 x i8>* %i8x8_r, align 8 + %197 = load i64, i64* %ul_a, align 8 + %198 = load i64, i64* %ul_b, align 8 + %199 = call i64 @llvm.riscv.khm16.i64(i64 %197, i64 %198) + store volatile i64 %199, i64* %ul_r, align 8 + %200 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %201 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %202 = call <4 x i16> @llvm.riscv.v.khm16.v4i16(<4 x i16> %200, <4 x i16> %201) + store volatile <4 x i16> %202, <4 x i16>* %i16x4_r, align 8 + %203 = load i64, i64* %ul_a, align 8 + %204 = load i64, i64* %ul_b, align 8 + %205 = call i64 @llvm.riscv.khmx16.i64(i64 %203, i64 %204) + store volatile i64 %205, i64* %ul_r, align 8 + %206 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %207 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %208 = call <4 x i16> @llvm.riscv.v.khmx16.v4i16(<4 x i16> %206, <4 x i16> %207) + store volatile <4 x i16> %208, <4 x i16>* %i16x4_r, align 8 + %209 = load i32, i32* %ui_a, align 4 + %conv16 = zext i32 %209 to i64 + %210 = load i32, i32* %ui_b, align 4 + %conv17 = zext i32 %210 to i64 + %211 = call i64 @llvm.riscv.khmbb.i64(i64 %conv16, i64 %conv17) + store volatile i64 %211, i64* %l_r, align 8 + %212 = load <4 x i16>, <4 x i16>* %i16x4_a, 
align 8 + %213 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %214 = call i64 @llvm.riscv.v.khmbb.i64.v4i16(<4 x i16> %212, <4 x i16> %213) + store volatile i64 %214, i64* %l_r, align 8 + %215 = load i32, i32* %ui_a, align 4 + %conv18 = zext i32 %215 to i64 + %216 = load i32, i32* %ui_b, align 4 + %conv19 = zext i32 %216 to i64 + %217 = call i64 @llvm.riscv.khmbt.i64(i64 %conv18, i64 %conv19) + store volatile i64 %217, i64* %l_r, align 8 + %218 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %219 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %220 = call i64 @llvm.riscv.v.khmbt.i64.v4i16(<4 x i16> %218, <4 x i16> %219) + store volatile i64 %220, i64* %l_r, align 8 + %221 = load i32, i32* %ui_a, align 4 + %conv20 = zext i32 %221 to i64 + %222 = load i32, i32* %ui_b, align 4 + %conv21 = zext i32 %222 to i64 + %223 = call i64 @llvm.riscv.khmtt.i64(i64 %conv20, i64 %conv21) + store volatile i64 %223, i64* %l_r, align 8 + %224 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %225 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %226 = call i64 @llvm.riscv.v.khmtt.i64.v4i16(<4 x i16> %224, <4 x i16> %225) + store volatile i64 %226, i64* %l_r, align 8 + %227 = load i64, i64* %l_t, align 8 + %228 = load i64, i64* %ul_a, align 8 + %229 = load i64, i64* %ul_b, align 8 + %230 = call i64 @llvm.riscv.kmabb.i64(i64 %227, i64 %228, i64 %229) + store volatile i64 %230, i64* %l_r, align 8 + %231 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %232 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %233 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %234 = call <2 x i32> @llvm.riscv.v.kmabb.v2i32.v4i16(<2 x i32> %231, <4 x i16> %232, <4 x i16> %233) + store volatile <2 x i32> %234, <2 x i32>* %i32x2_r, align 8 + %235 = load i64, i64* %l_t, align 8 + %236 = load i64, i64* %ul_a, align 8 + %237 = load i64, i64* %ul_b, align 8 + %238 = call i64 @llvm.riscv.kmabt.i64(i64 %235, i64 %236, i64 %237) + store volatile i64 %238, i64* %l_r, align 8 + %239 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %240 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %241 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %242 = call <2 x i32> @llvm.riscv.v.kmabt.v2i32.v4i16(<2 x i32> %239, <4 x i16> %240, <4 x i16> %241) + store volatile <2 x i32> %242, <2 x i32>* %i32x2_r, align 8 + %243 = load i64, i64* %l_t, align 8 + %244 = load i64, i64* %ul_a, align 8 + %245 = load i64, i64* %ul_b, align 8 + %246 = call i64 @llvm.riscv.kmatt.i64(i64 %243, i64 %244, i64 %245) + store volatile i64 %246, i64* %l_r, align 8 + %247 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %248 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %249 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %250 = call <2 x i32> @llvm.riscv.v.kmatt.v2i32.v4i16(<2 x i32> %247, <4 x i16> %248, <4 x i16> %249) + store volatile <2 x i32> %250, <2 x i32>* %i32x2_r, align 8 + %251 = load i64, i64* %l_t, align 8 + %252 = load i64, i64* %ul_a, align 8 + %253 = load i64, i64* %ul_b, align 8 + %254 = call i64 @llvm.riscv.kmada.i64(i64 %251, i64 %252, i64 %253) + store volatile i64 %254, i64* %l_r, align 8 + %255 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %256 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %257 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %258 = call <2 x i32> @llvm.riscv.v.kmada.v2i32.v4i16(<2 x i32> %255, <4 x i16> %256, <4 x i16> %257) + store volatile <2 x i32> %258, <2 x i32>* %i32x2_r, align 8 + %259 = load i64, i64* %l_t, align 8 + %260 = load i64, i64* %ul_a, align 8 + %261 = load i64, i64* %ul_b, align 8 + %262 = call i64 
@llvm.riscv.kmaxda.i64(i64 %259, i64 %260, i64 %261) + store volatile i64 %262, i64* %l_r, align 8 + %263 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %264 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %265 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %266 = call <2 x i32> @llvm.riscv.v.kmaxda.v2i32.v4i16(<2 x i32> %263, <4 x i16> %264, <4 x i16> %265) + store volatile <2 x i32> %266, <2 x i32>* %i32x2_r, align 8 + %267 = load i64, i64* %l_t, align 8 + %268 = load i64, i64* %ul_a, align 8 + %269 = load i64, i64* %ul_b, align 8 + %270 = call i64 @llvm.riscv.kmads.i64(i64 %267, i64 %268, i64 %269) + store volatile i64 %270, i64* %l_r, align 8 + %271 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %272 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %273 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %274 = call <2 x i32> @llvm.riscv.v.kmads.v2i32.v4i16(<2 x i32> %271, <4 x i16> %272, <4 x i16> %273) + store volatile <2 x i32> %274, <2 x i32>* %i32x2_r, align 8 + %275 = load i64, i64* %l_t, align 8 + %276 = load i64, i64* %ul_a, align 8 + %277 = load i64, i64* %ul_b, align 8 + %278 = call i64 @llvm.riscv.kmadrs.i64(i64 %275, i64 %276, i64 %277) + store volatile i64 %278, i64* %l_r, align 8 + %279 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %280 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %281 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %282 = call <2 x i32> @llvm.riscv.v.kmadrs.v2i32.v4i16(<2 x i32> %279, <4 x i16> %280, <4 x i16> %281) + store volatile <2 x i32> %282, <2 x i32>* %i32x2_r, align 8 + %283 = load i64, i64* %l_t, align 8 + %284 = load i64, i64* %ul_a, align 8 + %285 = load i64, i64* %ul_b, align 8 + %286 = call i64 @llvm.riscv.kmaxds.i64(i64 %283, i64 %284, i64 %285) + store volatile i64 %286, i64* %l_r, align 8 + %287 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %288 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %289 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %290 = call <2 x i32> @llvm.riscv.v.kmaxds.v2i32.v4i16(<2 x i32> %287, <4 x i16> %288, <4 x i16> %289) + store volatile <2 x i32> %290, <2 x i32>* %i32x2_r, align 8 + %291 = load i64, i64* %ul_a, align 8 + %292 = load i64, i64* %ul_b, align 8 + %293 = call i64 @llvm.riscv.kmda.i64(i64 %291, i64 %292) + store volatile i64 %293, i64* %l_r, align 8 + %294 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %295 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %296 = call <2 x i32> @llvm.riscv.v.kmda.v2i32.v4i16(<4 x i16> %294, <4 x i16> %295) + store volatile <2 x i32> %296, <2 x i32>* %i32x2_r, align 8 + %297 = load i64, i64* %ul_a, align 8 + %298 = load i64, i64* %ul_b, align 8 + %299 = call i64 @llvm.riscv.kmxda.i64(i64 %297, i64 %298) + store volatile i64 %299, i64* %l_r, align 8 + %300 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %301 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %302 = call <2 x i32> @llvm.riscv.v.kmxda.v2i32.v4i16(<4 x i16> %300, <4 x i16> %301) + store volatile <2 x i32> %302, <2 x i32>* %i32x2_r, align 8 + %303 = load i64, i64* %l_t, align 8 + %304 = load i64, i64* %l_a, align 8 + %305 = load i64, i64* %l_b, align 8 + %306 = call i64 @llvm.riscv.kmmac.i64(i64 %303, i64 %304, i64 %305) + store volatile i64 %306, i64* %l_r, align 8 + %307 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %308 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %309 = load <2 x i32>, <2 x i32>* %i32x2_b, align 8 + %310 = call <2 x i32> @llvm.riscv.v.kmmac.v2i32(<2 x i32> %307, <2 x i32> %308, <2 x i32> %309) + store volatile <2 x i32> %310, <2 x i32>* %i32x2_r, align 8 + %311 = 
load i64, i64* %l_t, align 8 + %312 = load i64, i64* %l_a, align 8 + %313 = load i64, i64* %l_b, align 8 + %314 = call i64 @llvm.riscv.kmmac.u.i64(i64 %311, i64 %312, i64 %313) + store volatile i64 %314, i64* %l_r, align 8 + %315 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %316 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %317 = load <2 x i32>, <2 x i32>* %i32x2_b, align 8 + %318 = call <2 x i32> @llvm.riscv.v.kmmac.u.v2i32(<2 x i32> %315, <2 x i32> %316, <2 x i32> %317) + store volatile <2 x i32> %318, <2 x i32>* %i32x2_r, align 8 + %319 = load i64, i64* %l_t, align 8 + %320 = load i64, i64* %ul_a, align 8 + %321 = load i64, i64* %ul_b, align 8 + %322 = call i64 @llvm.riscv.kmmawb.i64(i64 %319, i64 %320, i64 %321) + store volatile i64 %322, i64* %l_r, align 8 + %323 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %324 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %325 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %326 = call <2 x i32> @llvm.riscv.v.kmmawb.v2i32.v4i16(<2 x i32> %323, <2 x i32> %324, <4 x i16> %325) + store volatile <2 x i32> %326, <2 x i32>* %i32x2_r, align 8 + %327 = load i64, i64* %l_t, align 8 + %328 = load i64, i64* %ul_a, align 8 + %329 = load i64, i64* %ul_b, align 8 + %330 = call i64 @llvm.riscv.kmmawb.u.i64(i64 %327, i64 %328, i64 %329) + store volatile i64 %330, i64* %l_r, align 8 + %331 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %332 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %333 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %334 = call <2 x i32> @llvm.riscv.v.kmmawb.u.v2i32.v4i16(<2 x i32> %331, <2 x i32> %332, <4 x i16> %333) + store volatile <2 x i32> %334, <2 x i32>* %i32x2_r, align 8 + %335 = load i64, i64* %l_t, align 8 + %336 = load i64, i64* %ul_a, align 8 + %337 = load i64, i64* %ul_b, align 8 + %338 = call i64 @llvm.riscv.kmmawb2.i64(i64 %335, i64 %336, i64 %337) + store volatile i64 %338, i64* %l_r, align 8 + %339 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %340 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %341 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %342 = call <2 x i32> @llvm.riscv.v.kmmawb2.v2i32.v4i16(<2 x i32> %339, <2 x i32> %340, <4 x i16> %341) + store volatile <2 x i32> %342, <2 x i32>* %i32x2_r, align 8 + %343 = load i64, i64* %l_t, align 8 + %344 = load i64, i64* %ul_a, align 8 + %345 = load i64, i64* %ul_b, align 8 + %346 = call i64 @llvm.riscv.kmmawb2.u.i64(i64 %343, i64 %344, i64 %345) + store volatile i64 %346, i64* %l_r, align 8 + %347 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %348 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %349 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %350 = call <2 x i32> @llvm.riscv.v.kmmawb2.u.v2i32.v4i16(<2 x i32> %347, <2 x i32> %348, <4 x i16> %349) + store volatile <2 x i32> %350, <2 x i32>* %i32x2_r, align 8 + %351 = load i64, i64* %l_t, align 8 + %352 = load i64, i64* %ul_a, align 8 + %353 = load i64, i64* %ul_b, align 8 + %354 = call i64 @llvm.riscv.kmmawt.i64(i64 %351, i64 %352, i64 %353) + store volatile i64 %354, i64* %l_r, align 8 + %355 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %356 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %357 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %358 = call <2 x i32> @llvm.riscv.v.kmmawt.v2i32.v4i16(<2 x i32> %355, <2 x i32> %356, <4 x i16> %357) + store volatile <2 x i32> %358, <2 x i32>* %i32x2_r, align 8 + %359 = load i64, i64* %l_t, align 8 + %360 = load i64, i64* %ul_a, align 8 + %361 = load i64, i64* %ul_b, align 8 + %362 = call i64 @llvm.riscv.kmmawt.u.i64(i64 %359, i64 %360, i64 
%361) + store volatile i64 %362, i64* %l_r, align 8 + %363 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %364 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %365 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %366 = call <2 x i32> @llvm.riscv.v.kmmawt.u.v2i32.v4i16(<2 x i32> %363, <2 x i32> %364, <4 x i16> %365) + store volatile <2 x i32> %366, <2 x i32>* %i32x2_r, align 8 + %367 = load i64, i64* %l_t, align 8 + %368 = load i64, i64* %ul_a, align 8 + %369 = load i64, i64* %ul_b, align 8 + %370 = call i64 @llvm.riscv.kmmawt2.i64(i64 %367, i64 %368, i64 %369) + store volatile i64 %370, i64* %l_r, align 8 + %371 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %372 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %373 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %374 = call <2 x i32> @llvm.riscv.v.kmmawt2.v2i32.v4i16(<2 x i32> %371, <2 x i32> %372, <4 x i16> %373) + store volatile <2 x i32> %374, <2 x i32>* %i32x2_r, align 8 + %375 = load i64, i64* %l_t, align 8 + %376 = load i64, i64* %ul_a, align 8 + %377 = load i64, i64* %ul_b, align 8 + %378 = call i64 @llvm.riscv.kmmawt2.u.i64(i64 %375, i64 %376, i64 %377) + store volatile i64 %378, i64* %l_r, align 8 + %379 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %380 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %381 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %382 = call <2 x i32> @llvm.riscv.v.kmmawt2.u.v2i32.v4i16(<2 x i32> %379, <2 x i32> %380, <4 x i16> %381) + store volatile <2 x i32> %382, <2 x i32>* %i32x2_r, align 8 + %383 = load i64, i64* %l_t, align 8 + %384 = load i64, i64* %l_a, align 8 + %385 = load i64, i64* %l_b, align 8 + %386 = call i64 @llvm.riscv.kmmsb.i64(i64 %383, i64 %384, i64 %385) + store volatile i64 %386, i64* %l_r, align 8 + %387 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %388 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %389 = load <2 x i32>, <2 x i32>* %i32x2_b, align 8 + %390 = call <2 x i32> @llvm.riscv.v.kmmsb.v2i32(<2 x i32> %387, <2 x i32> %388, <2 x i32> %389) + store volatile <2 x i32> %390, <2 x i32>* %i32x2_r, align 8 + %391 = load i64, i64* %l_t, align 8 + %392 = load i64, i64* %l_a, align 8 + %393 = load i64, i64* %l_b, align 8 + %394 = call i64 @llvm.riscv.kmmsb.u.i64(i64 %391, i64 %392, i64 %393) + store volatile i64 %394, i64* %l_r, align 8 + %395 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %396 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %397 = load <2 x i32>, <2 x i32>* %i32x2_b, align 8 + %398 = call <2 x i32> @llvm.riscv.v.kmmsb.u.v2i32(<2 x i32> %395, <2 x i32> %396, <2 x i32> %397) + store volatile <2 x i32> %398, <2 x i32>* %i32x2_r, align 8 + %399 = load i64, i64* %l_a, align 8 + %400 = load i64, i64* %ul_b, align 8 + %401 = call i64 @llvm.riscv.kmmwb2.i64(i64 %399, i64 %400) + store volatile i64 %401, i64* %l_r, align 8 + %402 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %403 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %404 = call <2 x i32> @llvm.riscv.v.kmmwb2.v2i32.v4i16(<2 x i32> %402, <4 x i16> %403) + store volatile <2 x i32> %404, <2 x i32>* %i32x2_r, align 8 + %405 = load i64, i64* %l_a, align 8 + %406 = load i64, i64* %ul_b, align 8 + %407 = call i64 @llvm.riscv.kmmwb2.u.i64(i64 %405, i64 %406) + store volatile i64 %407, i64* %l_r, align 8 + %408 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %409 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %410 = call <2 x i32> @llvm.riscv.v.kmmwb2.u.v2i32.v4i16(<2 x i32> %408, <4 x i16> %409) + store volatile <2 x i32> %410, <2 x i32>* %i32x2_r, align 8 + %411 = load i64, i64* %l_a, align 8 + 
%412 = load i64, i64* %ul_b, align 8 + %413 = call i64 @llvm.riscv.kmmwt2.i64(i64 %411, i64 %412) + store volatile i64 %413, i64* %l_r, align 8 + %414 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %415 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %416 = call <2 x i32> @llvm.riscv.v.kmmwt2.v2i32.v4i16(<2 x i32> %414, <4 x i16> %415) + store volatile <2 x i32> %416, <2 x i32>* %i32x2_r, align 8 + %417 = load i64, i64* %l_a, align 8 + %418 = load i64, i64* %ul_b, align 8 + %419 = call i64 @llvm.riscv.kmmwt2.u.i64(i64 %417, i64 %418) + store volatile i64 %419, i64* %l_r, align 8 + %420 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %421 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %422 = call <2 x i32> @llvm.riscv.v.kmmwt2.u.v2i32.v4i16(<2 x i32> %420, <4 x i16> %421) + store volatile <2 x i32> %422, <2 x i32>* %i32x2_r, align 8 + %423 = load i64, i64* %l_t, align 8 + %424 = load i64, i64* %ul_a, align 8 + %425 = load i64, i64* %ul_b, align 8 + %426 = call i64 @llvm.riscv.kmsda.i64(i64 %423, i64 %424, i64 %425) + store volatile i64 %426, i64* %l_r, align 8 + %427 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %428 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %429 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %430 = call <2 x i32> @llvm.riscv.v.kmsda.v2i32.v4i16(<2 x i32> %427, <4 x i16> %428, <4 x i16> %429) + store volatile <2 x i32> %430, <2 x i32>* %i32x2_r, align 8 + %431 = load i64, i64* %l_t, align 8 + %432 = load i64, i64* %ul_a, align 8 + %433 = load i64, i64* %ul_b, align 8 + %434 = call i64 @llvm.riscv.kmsxda.i64(i64 %431, i64 %432, i64 %433) + store volatile i64 %434, i64* %l_r, align 8 + %435 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %436 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %437 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %438 = call <2 x i32> @llvm.riscv.v.kmsxda.v2i32.v4i16(<2 x i32> %435, <4 x i16> %436, <4 x i16> %437) + store volatile <2 x i32> %438, <2 x i32>* %i32x2_r, align 8 + %439 = load i64, i64* %l_a, align 8 + %440 = load i32, i32* %i_b, align 4 + %conv22 = sext i32 %440 to i64 + %441 = call i64 @llvm.riscv.ksllw.i64(i64 %439, i64 %conv22) + store volatile i64 %441, i64* %l_r, align 8 + %442 = load i64, i64* %ul_a, align 8 + %443 = load i32, i32* %ui_b, align 4 + %conv23 = zext i32 %443 to i64 + %444 = call i64 @llvm.riscv.ksll8.i64(i64 %442, i64 %conv23) + store volatile i64 %444, i64* %ul_r, align 8 + %445 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %446 = load i32, i32* %ui_b, align 4 + %conv24 = zext i32 %446 to i64 + %447 = call <8 x i8> @llvm.riscv.v.ksll8.v8i8.i64(<8 x i8> %445, i64 %conv24) + store volatile <8 x i8> %447, <8 x i8>* %i8x8_r, align 8 + %448 = load i64, i64* %ul_a, align 8 + %449 = load i32, i32* %ui_b, align 4 + %conv25 = zext i32 %449 to i64 + %450 = call i64 @llvm.riscv.ksll16.i64(i64 %448, i64 %conv25) + store volatile i64 %450, i64* %ul_r, align 8 + %451 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %452 = load i32, i32* %ui_b, align 4 + %conv26 = zext i32 %452 to i64 + %453 = call <4 x i16> @llvm.riscv.v.ksll16.v4i16.i64(<4 x i16> %451, i64 %conv26) + store volatile <4 x i16> %453, <4 x i16>* %i16x4_r, align 8 + %454 = load i64, i64* %ul_a, align 8 + %455 = load i32, i32* %i_b, align 4 + %conv27 = sext i32 %455 to i64 + %456 = call i64 @llvm.riscv.kslra8.i64(i64 %454, i64 %conv27) + store volatile i64 %456, i64* %ul_r, align 8 + %457 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %458 = load i32, i32* %i_b, align 4 + %conv28 = sext i32 %458 to i64 + %459 = call <8 x i8> 
@llvm.riscv.v.kslra8.v8i8.i64(<8 x i8> %457, i64 %conv28) + store volatile <8 x i8> %459, <8 x i8>* %i8x8_r, align 8 + %460 = load i64, i64* %ul_a, align 8 + %461 = load i32, i32* %i_b, align 4 + %conv29 = sext i32 %461 to i64 + %462 = call i64 @llvm.riscv.kslra8.u.i64(i64 %460, i64 %conv29) + store volatile i64 %462, i64* %ul_r, align 8 + %463 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %464 = load i32, i32* %i_b, align 4 + %conv30 = sext i32 %464 to i64 + %465 = call <8 x i8> @llvm.riscv.v.kslra8.u.v8i8.i64(<8 x i8> %463, i64 %conv30) + store volatile <8 x i8> %465, <8 x i8>* %i8x8_r, align 8 + %466 = load i64, i64* %ul_a, align 8 + %467 = load i32, i32* %i_b, align 4 + %conv31 = sext i32 %467 to i64 + %468 = call i64 @llvm.riscv.kslra16.i64(i64 %466, i64 %conv31) + store volatile i64 %468, i64* %ul_r, align 8 + %469 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %470 = load i32, i32* %i_b, align 4 + %conv32 = sext i32 %470 to i64 + %471 = call <4 x i16> @llvm.riscv.v.kslra16.v4i16.i64(<4 x i16> %469, i64 %conv32) + store volatile <4 x i16> %471, <4 x i16>* %i16x4_r, align 8 + %472 = load i64, i64* %ul_a, align 8 + %473 = load i32, i32* %i_b, align 4 + %conv33 = sext i32 %473 to i64 + %474 = call i64 @llvm.riscv.kslra16.u.i64(i64 %472, i64 %conv33) + store volatile i64 %474, i64* %ul_r, align 8 + %475 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %476 = load i32, i32* %i_b, align 4 + %conv34 = sext i32 %476 to i64 + %477 = call <4 x i16> @llvm.riscv.v.kslra16.u.v4i16.i64(<4 x i16> %475, i64 %conv34) + store volatile <4 x i16> %477, <4 x i16>* %i16x4_r, align 8 + %478 = load i64, i64* %ul_a, align 8 + %479 = load i64, i64* %ul_b, align 8 + %480 = call i64 @llvm.riscv.kstas16.i64(i64 %478, i64 %479) + store volatile i64 %480, i64* %ul_r, align 8 + %481 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %482 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %483 = call <4 x i16> @llvm.riscv.v.kstas16.v4i16(<4 x i16> %481, <4 x i16> %482) + store volatile <4 x i16> %483, <4 x i16>* %i16x4_r, align 8 + %484 = load i64, i64* %ul_a, align 8 + %485 = load i64, i64* %ul_b, align 8 + %486 = call i64 @llvm.riscv.kstsa16.i64(i64 %484, i64 %485) + store volatile i64 %486, i64* %ul_r, align 8 + %487 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %488 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %489 = call <4 x i16> @llvm.riscv.v.kstsa16.v4i16(<4 x i16> %487, <4 x i16> %488) + store volatile <4 x i16> %489, <4 x i16>* %i16x4_r, align 8 + %490 = load i64, i64* %ul_a, align 8 + %491 = load i64, i64* %ul_b, align 8 + %492 = call i64 @llvm.riscv.ksub8.i64(i64 %490, i64 %491) + store volatile i64 %492, i64* %ul_r, align 8 + %493 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %494 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %495 = call <8 x i8> @llvm.riscv.v.ksub8.v8i8(<8 x i8> %493, <8 x i8> %494) + store volatile <8 x i8> %495, <8 x i8>* %i8x8_r, align 8 + %496 = load i64, i64* %ul_a, align 8 + %497 = load i64, i64* %ul_b, align 8 + %498 = call i64 @llvm.riscv.ksub16.i64(i64 %496, i64 %497) + store volatile i64 %498, i64* %ul_r, align 8 + %499 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %500 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %501 = call <4 x i16> @llvm.riscv.v.ksub16.v4i16(<4 x i16> %499, <4 x i16> %500) + store volatile <4 x i16> %501, <4 x i16>* %i16x4_r, align 8 + %502 = load i32, i32* %i_a, align 4 + %conv35 = sext i32 %502 to i64 + %503 = load i32, i32* %i_b, align 4 + %conv36 = sext i32 %503 to i64 + %504 = call i64 @llvm.riscv.ksubh.i64(i64 %conv35, i64 %conv36) + 
store volatile i64 %504, i64* %l_r, align 8 + %505 = load i32, i32* %i_a, align 4 + %conv37 = sext i32 %505 to i64 + %506 = load i32, i32* %i_b, align 4 + %conv38 = sext i32 %506 to i64 + %507 = call i64 @llvm.riscv.ksubw.i64(i64 %conv37, i64 %conv38) + store volatile i64 %507, i64* %l_r, align 8 + %508 = load i64, i64* %l_a, align 8 + %509 = load i64, i64* %l_b, align 8 + %510 = call i64 @llvm.riscv.kwmmul.i64(i64 %508, i64 %509) + store volatile i64 %510, i64* %l_r, align 8 + %511 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %512 = load <2 x i32>, <2 x i32>* %i32x2_b, align 8 + %513 = call <2 x i32> @llvm.riscv.v.kwmmul.v2i32(<2 x i32> %511, <2 x i32> %512) + store volatile <2 x i32> %513, <2 x i32>* %i32x2_r, align 8 + %514 = load i64, i64* %l_a, align 8 + %515 = load i64, i64* %l_b, align 8 + %516 = call i64 @llvm.riscv.kwmmul.u.i64(i64 %514, i64 %515) + store volatile i64 %516, i64* %l_r, align 8 + %517 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %518 = load <2 x i32>, <2 x i32>* %i32x2_b, align 8 + %519 = call <2 x i32> @llvm.riscv.v.kwmmul.u.v2i32(<2 x i32> %517, <2 x i32> %518) + store volatile <2 x i32> %519, <2 x i32>* %i32x2_r, align 8 + %520 = load i32, i32* %i_a, align 4 + %conv39 = sext i32 %520 to i64 + %521 = load i32, i32* %i_b, align 4 + %conv40 = sext i32 %521 to i64 + %522 = call i64 @llvm.riscv.maxw.i64(i64 %conv39, i64 %conv40) + store volatile i64 %522, i64* %l_r, align 8 + %523 = load i32, i32* %i_a, align 4 + %conv41 = sext i32 %523 to i64 + %524 = load i32, i32* %i_b, align 4 + %conv42 = sext i32 %524 to i64 + %525 = call i64 @llvm.riscv.minw.i64(i64 %conv41, i64 %conv42) + store volatile i64 %525, i64* %l_r, align 8 + %526 = load i64, i64* %ul_a, align 8 + %527 = load i64, i64* %ul_b, align 8 + %528 = call i64 @llvm.riscv.pbsad.i64(i64 %526, i64 %527) + store volatile i64 %528, i64* %ul_r, align 8 + %529 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %530 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %531 = call i64 @llvm.riscv.v.pbsad.i64.v8i8(<8 x i8> %529, <8 x i8> %530) + store volatile i64 %531, i64* %ul_r, align 8 + %532 = load i64, i64* %ul_t, align 8 + %533 = load i64, i64* %ul_a, align 8 + %534 = load i64, i64* %ul_b, align 8 + %535 = call i64 @llvm.riscv.pbsada.i64(i64 %532, i64 %533, i64 %534) + store volatile i64 %535, i64* %ul_r, align 8 + %536 = load i64, i64* %ul_t, align 8 + %537 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %538 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %539 = call i64 @llvm.riscv.v.pbsada.i64.v8i8(i64 %536, <8 x i8> %537, <8 x i8> %538) + store volatile i64 %539, i64* %ul_r, align 8 + %540 = load i64, i64* %ul_a, align 8 + %541 = load i64, i64* %ul_b, align 8 + %542 = call i64 @llvm.riscv.pkbb16.i64(i64 %540, i64 %541) + store volatile i64 %542, i64* %ul_r, align 8 + %543 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %544 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %545 = call <4 x i16> @llvm.riscv.v.pkbb16.v4i16(<4 x i16> %543, <4 x i16> %544) + store volatile <4 x i16> %545, <4 x i16>* %u16x4_r, align 8 + %546 = load i64, i64* %ul_a, align 8 + %547 = load i64, i64* %ul_b, align 8 + %548 = call i64 @llvm.riscv.pkbt16.i64(i64 %546, i64 %547) + store volatile i64 %548, i64* %ul_r, align 8 + %549 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %550 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %551 = call <4 x i16> @llvm.riscv.v.pkbt16.v4i16(<4 x i16> %549, <4 x i16> %550) + store volatile <4 x i16> %551, <4 x i16>* %u16x4_r, align 8 + %552 = load i64, i64* %ul_a, align 8 + %553 = load i64, i64* %ul_b, 
align 8 + %554 = call i64 @llvm.riscv.pktt16.i64(i64 %552, i64 %553) + store volatile i64 %554, i64* %ul_r, align 8 + %555 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %556 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %557 = call <4 x i16> @llvm.riscv.v.pktt16.v4i16(<4 x i16> %555, <4 x i16> %556) + store volatile <4 x i16> %557, <4 x i16>* %u16x4_r, align 8 + %558 = load i64, i64* %ul_a, align 8 + %559 = load i64, i64* %ul_b, align 8 + %560 = call i64 @llvm.riscv.pktb16.i64(i64 %558, i64 %559) + store volatile i64 %560, i64* %ul_r, align 8 + %561 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %562 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %563 = call <4 x i16> @llvm.riscv.v.pktb16.v4i16(<4 x i16> %561, <4 x i16> %562) + store volatile <4 x i16> %563, <4 x i16>* %u16x4_r, align 8 + %564 = load i64, i64* %ul_a, align 8 + %565 = load i64, i64* %ul_b, align 8 + %566 = call i64 @llvm.riscv.radd8.i64(i64 %564, i64 %565) + store volatile i64 %566, i64* %ul_r, align 8 + %567 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %568 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %569 = call <8 x i8> @llvm.riscv.v.radd8.v8i8(<8 x i8> %567, <8 x i8> %568) + store volatile <8 x i8> %569, <8 x i8>* %i8x8_r, align 8 + %570 = load i64, i64* %ul_a, align 8 + %571 = load i64, i64* %ul_b, align 8 + %572 = call i64 @llvm.riscv.radd16.i64(i64 %570, i64 %571) + store volatile i64 %572, i64* %ul_r, align 8 + %573 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %574 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %575 = call <4 x i16> @llvm.riscv.v.radd16.v4i16(<4 x i16> %573, <4 x i16> %574) + store volatile <4 x i16> %575, <4 x i16>* %i16x4_r, align 8 + %576 = load i32, i32* %i_a, align 4 + %conv43 = sext i32 %576 to i64 + %577 = load i32, i32* %i_b, align 4 + %conv44 = sext i32 %577 to i64 + %578 = call i64 @llvm.riscv.raddw.i64(i64 %conv43, i64 %conv44) + store volatile i64 %578, i64* %l_r, align 8 + %579 = load i64, i64* %ul_a, align 8 + %580 = load i64, i64* %ul_b, align 8 + %581 = call i64 @llvm.riscv.rcras16.i64(i64 %579, i64 %580) + store volatile i64 %581, i64* %ul_r, align 8 + %582 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %583 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %584 = call <4 x i16> @llvm.riscv.v.rcras16.v4i16(<4 x i16> %582, <4 x i16> %583) + store volatile <4 x i16> %584, <4 x i16>* %i16x4_r, align 8 + %585 = load i64, i64* %ul_a, align 8 + %586 = load i64, i64* %ul_b, align 8 + %587 = call i64 @llvm.riscv.rcrsa16.i64(i64 %585, i64 %586) + store volatile i64 %587, i64* %ul_r, align 8 + %588 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %589 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %590 = call <4 x i16> @llvm.riscv.v.rcrsa16.v4i16(<4 x i16> %588, <4 x i16> %589) + store volatile <4 x i16> %590, <4 x i16>* %i16x4_r, align 8 + %591 = load i64, i64* %ul_a, align 8 + %592 = load i64, i64* %ul_b, align 8 + %593 = call i64 @llvm.riscv.rstas16.i64(i64 %591, i64 %592) + store volatile i64 %593, i64* %ul_r, align 8 + %594 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %595 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %596 = call <4 x i16> @llvm.riscv.v.rstas16.v4i16(<4 x i16> %594, <4 x i16> %595) + store volatile <4 x i16> %596, <4 x i16>* %i16x4_r, align 8 + %597 = load i64, i64* %ul_a, align 8 + %598 = load i64, i64* %ul_b, align 8 + %599 = call i64 @llvm.riscv.rstsa16.i64(i64 %597, i64 %598) + store volatile i64 %599, i64* %ul_r, align 8 + %600 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %601 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %602 = 
call <4 x i16> @llvm.riscv.v.rstsa16.v4i16(<4 x i16> %600, <4 x i16> %601) + store volatile <4 x i16> %602, <4 x i16>* %i16x4_r, align 8 + %603 = load i64, i64* %ul_a, align 8 + %604 = load i64, i64* %ul_b, align 8 + %605 = call i64 @llvm.riscv.rsub8.i64(i64 %603, i64 %604) + store volatile i64 %605, i64* %ul_r, align 8 + %606 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %607 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %608 = call <8 x i8> @llvm.riscv.v.rsub8.v8i8(<8 x i8> %606, <8 x i8> %607) + store volatile <8 x i8> %608, <8 x i8>* %i8x8_r, align 8 + %609 = load i64, i64* %ul_a, align 8 + %610 = load i64, i64* %ul_b, align 8 + %611 = call i64 @llvm.riscv.rsub16.i64(i64 %609, i64 %610) + store volatile i64 %611, i64* %ul_r, align 8 + %612 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %613 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %614 = call <4 x i16> @llvm.riscv.v.rsub16.v4i16(<4 x i16> %612, <4 x i16> %613) + store volatile <4 x i16> %614, <4 x i16>* %i16x4_r, align 8 + %615 = load i32, i32* %i_a, align 4 + %conv45 = sext i32 %615 to i64 + %616 = load i32, i32* %i_b, align 4 + %conv46 = sext i32 %616 to i64 + %617 = call i64 @llvm.riscv.rsubw.i64(i64 %conv45, i64 %conv46) + store volatile i64 %617, i64* %l_r, align 8 + %618 = load i64, i64* %ul_a, align 8 + %619 = call i64 @llvm.riscv.sclip8.i64(i64 %618, i64 7) + store volatile i64 %619, i64* %ul_r, align 8 + %620 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %621 = call <8 x i8> @llvm.riscv.v.sclip8.v8i8.i64(<8 x i8> %620, i64 7) + store volatile <8 x i8> %621, <8 x i8>* %i8x8_r, align 8 + %622 = load i64, i64* %ul_a, align 8 + %623 = call i64 @llvm.riscv.sclip16.i64(i64 %622, i64 8) + store volatile i64 %623, i64* %ul_r, align 8 + %624 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %625 = call <4 x i16> @llvm.riscv.v.sclip16.v4i16.i64(<4 x i16> %624, i64 8) + store volatile <4 x i16> %625, <4 x i16>* %i16x4_r, align 8 + %626 = load i64, i64* %l_a, align 8 + %627 = call i64 @llvm.riscv.sclip32.i64(i64 %626, i64 9) + store volatile i64 %627, i64* %l_r, align 8 + %628 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %629 = call <2 x i32> @llvm.riscv.v.sclip32.v2i32.i64(<2 x i32> %628, i64 9) + store volatile <2 x i32> %629, <2 x i32>* %i32x2_r, align 8 + %630 = load i64, i64* %ul_a, align 8 + %631 = load i64, i64* %ul_b, align 8 + %632 = call i64 @llvm.riscv.scmple8.i64(i64 %630, i64 %631) + store volatile i64 %632, i64* %ul_r, align 8 + %633 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %634 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %635 = call <8 x i8> @llvm.riscv.v.scmple8.v8i8(<8 x i8> %633, <8 x i8> %634) + store volatile <8 x i8> %635, <8 x i8>* %u8x8_r, align 8 + %636 = load i64, i64* %ul_a, align 8 + %637 = load i64, i64* %ul_b, align 8 + %638 = call i64 @llvm.riscv.scmple16.i64(i64 %636, i64 %637) + store volatile i64 %638, i64* %ul_r, align 8 + %639 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %640 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %641 = call <4 x i16> @llvm.riscv.v.scmple16.v4i16(<4 x i16> %639, <4 x i16> %640) + store volatile <4 x i16> %641, <4 x i16>* %u16x4_r, align 8 + %642 = load i64, i64* %ul_a, align 8 + %643 = load i64, i64* %ul_b, align 8 + %644 = call i64 @llvm.riscv.scmplt8.i64(i64 %642, i64 %643) + store volatile i64 %644, i64* %ul_r, align 8 + %645 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %646 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %647 = call <8 x i8> @llvm.riscv.v.scmplt8.v8i8(<8 x i8> %645, <8 x i8> %646) + store volatile <8 x i8> %647, <8 x i8>* %u8x8_r, 
align 8 + %648 = load i64, i64* %ul_a, align 8 + %649 = load i64, i64* %ul_b, align 8 + %650 = call i64 @llvm.riscv.scmplt16.i64(i64 %648, i64 %649) + store volatile i64 %650, i64* %ul_r, align 8 + %651 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %652 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %653 = call <4 x i16> @llvm.riscv.v.scmplt16.v4i16(<4 x i16> %651, <4 x i16> %652) + store volatile <4 x i16> %653, <4 x i16>* %u16x4_r, align 8 + %654 = load i64, i64* %ul_a, align 8 + %655 = load i32, i32* %ui_b, align 4 + %conv47 = zext i32 %655 to i64 + %656 = call i64 @llvm.riscv.sll8.i64(i64 %654, i64 %conv47) + store volatile i64 %656, i64* %ul_r, align 8 + %657 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %658 = load i32, i32* %ui_b, align 4 + %conv48 = zext i32 %658 to i64 + %659 = call <8 x i8> @llvm.riscv.v.sll8.v8i8.i64(<8 x i8> %657, i64 %conv48) + store volatile <8 x i8> %659, <8 x i8>* %u8x8_r, align 8 + %660 = load i64, i64* %ul_a, align 8 + %661 = load i32, i32* %ui_b, align 4 + %conv49 = zext i32 %661 to i64 + %662 = call i64 @llvm.riscv.sll16.i64(i64 %660, i64 %conv49) + store volatile i64 %662, i64* %ul_r, align 8 + %663 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %664 = load i32, i32* %ui_b, align 4 + %conv50 = zext i32 %664 to i64 + %665 = call <4 x i16> @llvm.riscv.v.sll16.v4i16.i64(<4 x i16> %663, i64 %conv50) + store volatile <4 x i16> %665, <4 x i16>* %u16x4_r, align 8 + %666 = load i64, i64* %l_t, align 8 + %667 = load i64, i64* %ul_a, align 8 + %668 = load i64, i64* %ul_b, align 8 + %669 = call i64 @llvm.riscv.smaqa.i64(i64 %666, i64 %667, i64 %668) + store volatile i64 %669, i64* %l_r, align 8 + %670 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %671 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %672 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %673 = call <2 x i32> @llvm.riscv.v.smaqa.v2i32.v8i8(<2 x i32> %670, <8 x i8> %671, <8 x i8> %672) + store volatile <2 x i32> %673, <2 x i32>* %i32x2_r, align 8 + %674 = load i64, i64* %l_t, align 8 + %675 = load i64, i64* %ul_a, align 8 + %676 = load i64, i64* %ul_b, align 8 + %677 = call i64 @llvm.riscv.smaqa.su.i64(i64 %674, i64 %675, i64 %676) + store volatile i64 %677, i64* %l_r, align 8 + %678 = load <2 x i32>, <2 x i32>* %i32x2_t, align 8 + %679 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %680 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %681 = call <2 x i32> @llvm.riscv.v.smaqa.su.v2i32.v8i8(<2 x i32> %678, <8 x i8> %679, <8 x i8> %680) + store volatile <2 x i32> %681, <2 x i32>* %i32x2_r, align 8 + %682 = load i64, i64* %ul_a, align 8 + %683 = load i64, i64* %ul_b, align 8 + %684 = call i64 @llvm.riscv.smax8.i64(i64 %682, i64 %683) + store volatile i64 %684, i64* %ul_r, align 8 + %685 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %686 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %687 = call <8 x i8> @llvm.riscv.v.smax8.v8i8(<8 x i8> %685, <8 x i8> %686) + store volatile <8 x i8> %687, <8 x i8>* %i8x8_r, align 8 + %688 = load i64, i64* %ul_a, align 8 + %689 = load i64, i64* %ul_b, align 8 + %690 = call i64 @llvm.riscv.smax16.i64(i64 %688, i64 %689) + store volatile i64 %690, i64* %ul_r, align 8 + %691 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %692 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %693 = call <4 x i16> @llvm.riscv.v.smax16.v4i16(<4 x i16> %691, <4 x i16> %692) + store volatile <4 x i16> %693, <4 x i16>* %i16x4_r, align 8 + %694 = load i64, i64* %ul_a, align 8 + %695 = load i64, i64* %ul_b, align 8 + %696 = call i64 @llvm.riscv.smbb16.i64(i64 %694, i64 %695) + store volatile i64 
%696, i64* %l_r, align 8 + %697 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %698 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %699 = call <2 x i32> @llvm.riscv.v.smbb16.v2i32.v4i16(<4 x i16> %697, <4 x i16> %698) + store volatile <2 x i32> %699, <2 x i32>* %i32x2_r, align 8 + %700 = load i64, i64* %ul_a, align 8 + %701 = load i64, i64* %ul_b, align 8 + %702 = call i64 @llvm.riscv.smbt16.i64(i64 %700, i64 %701) + store volatile i64 %702, i64* %l_r, align 8 + %703 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %704 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %705 = call <2 x i32> @llvm.riscv.v.smbt16.v2i32.v4i16(<4 x i16> %703, <4 x i16> %704) + store volatile <2 x i32> %705, <2 x i32>* %i32x2_r, align 8 + %706 = load i64, i64* %ul_a, align 8 + %707 = load i64, i64* %ul_b, align 8 + %708 = call i64 @llvm.riscv.smtt16.i64(i64 %706, i64 %707) + store volatile i64 %708, i64* %l_r, align 8 + %709 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %710 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %711 = call <2 x i32> @llvm.riscv.v.smtt16.v2i32.v4i16(<4 x i16> %709, <4 x i16> %710) + store volatile <2 x i32> %711, <2 x i32>* %i32x2_r, align 8 + %712 = load i64, i64* %ul_a, align 8 + %713 = load i64, i64* %ul_b, align 8 + %714 = call i64 @llvm.riscv.smds.i64(i64 %712, i64 %713) + store volatile i64 %714, i64* %l_r, align 8 + %715 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %716 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %717 = call <2 x i32> @llvm.riscv.v.smds.v2i32.v4i16(<4 x i16> %715, <4 x i16> %716) + store volatile <2 x i32> %717, <2 x i32>* %i32x2_r, align 8 + %718 = load i64, i64* %ul_a, align 8 + %719 = load i64, i64* %ul_b, align 8 + %720 = call i64 @llvm.riscv.smdrs.i64(i64 %718, i64 %719) + store volatile i64 %720, i64* %l_r, align 8 + %721 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %722 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %723 = call <2 x i32> @llvm.riscv.v.smdrs.v2i32.v4i16(<4 x i16> %721, <4 x i16> %722) + store volatile <2 x i32> %723, <2 x i32>* %i32x2_r, align 8 + %724 = load i64, i64* %ul_a, align 8 + %725 = load i64, i64* %ul_b, align 8 + %726 = call i64 @llvm.riscv.smxds.i64(i64 %724, i64 %725) + store volatile i64 %726, i64* %l_r, align 8 + %727 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %728 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %729 = call <2 x i32> @llvm.riscv.v.smxds.v2i32.v4i16(<4 x i16> %727, <4 x i16> %728) + store volatile <2 x i32> %729, <2 x i32>* %i32x2_r, align 8 + %730 = load i64, i64* %ul_a, align 8 + %731 = load i64, i64* %ul_b, align 8 + %732 = call i64 @llvm.riscv.smin8.i64(i64 %730, i64 %731) + store volatile i64 %732, i64* %ul_r, align 8 + %733 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %734 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %735 = call <8 x i8> @llvm.riscv.v.smin8.v8i8(<8 x i8> %733, <8 x i8> %734) + store volatile <8 x i8> %735, <8 x i8>* %i8x8_r, align 8 + %736 = load i64, i64* %ul_a, align 8 + %737 = load i64, i64* %ul_b, align 8 + %738 = call i64 @llvm.riscv.smin16.i64(i64 %736, i64 %737) + store volatile i64 %738, i64* %ul_r, align 8 + %739 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %740 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %741 = call <4 x i16> @llvm.riscv.v.smin16.v4i16(<4 x i16> %739, <4 x i16> %740) + store volatile <4 x i16> %741, <4 x i16>* %i16x4_r, align 8 + %742 = load i64, i64* %l_a, align 8 + %743 = load i64, i64* %l_b, align 8 + %744 = call i64 @llvm.riscv.smmul.i64(i64 %742, i64 %743) + store volatile i64 %744, i64* %l_r, align 8 + %745 = load 
<2 x i32>, <2 x i32>* %i32x2_a, align 8 + %746 = load <2 x i32>, <2 x i32>* %i32x2_b, align 8 + %747 = call <2 x i32> @llvm.riscv.v.smmul.v2i32(<2 x i32> %745, <2 x i32> %746) + store volatile <2 x i32> %747, <2 x i32>* %i32x2_r, align 8 + %748 = load i64, i64* %l_a, align 8 + %749 = load i64, i64* %l_b, align 8 + %750 = call i64 @llvm.riscv.smmul.u.i64(i64 %748, i64 %749) + store volatile i64 %750, i64* %l_r, align 8 + %751 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %752 = load <2 x i32>, <2 x i32>* %i32x2_b, align 8 + %753 = call <2 x i32> @llvm.riscv.v.smmul.u.v2i32(<2 x i32> %751, <2 x i32> %752) + store volatile <2 x i32> %753, <2 x i32>* %i32x2_r, align 8 + %754 = load i64, i64* %l_a, align 8 + %755 = load i64, i64* %ul_b, align 8 + %756 = call i64 @llvm.riscv.smmwb.i64(i64 %754, i64 %755) + store volatile i64 %756, i64* %l_r, align 8 + %757 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %758 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %759 = call <2 x i32> @llvm.riscv.v.smmwb.v2i32.v4i16(<2 x i32> %757, <4 x i16> %758) + store volatile <2 x i32> %759, <2 x i32>* %i32x2_r, align 8 + %760 = load i64, i64* %l_a, align 8 + %761 = load i64, i64* %ul_b, align 8 + %762 = call i64 @llvm.riscv.smmwb.u.i64(i64 %760, i64 %761) + store volatile i64 %762, i64* %l_r, align 8 + %763 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %764 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %765 = call <2 x i32> @llvm.riscv.v.smmwb.u.v2i32.v4i16(<2 x i32> %763, <4 x i16> %764) + store volatile <2 x i32> %765, <2 x i32>* %i32x2_r, align 8 + %766 = load i64, i64* %l_a, align 8 + %767 = load i64, i64* %ul_b, align 8 + %768 = call i64 @llvm.riscv.smmwt.i64(i64 %766, i64 %767) + store volatile i64 %768, i64* %l_r, align 8 + %769 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %770 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %771 = call <2 x i32> @llvm.riscv.v.smmwt.v2i32.v4i16(<2 x i32> %769, <4 x i16> %770) + store volatile <2 x i32> %771, <2 x i32>* %i32x2_r, align 8 + %772 = load i64, i64* %l_a, align 8 + %773 = load i64, i64* %ul_b, align 8 + %774 = call i64 @llvm.riscv.smmwt.u.i64(i64 %772, i64 %773) + store volatile i64 %774, i64* %l_r, align 8 + %775 = load <2 x i32>, <2 x i32>* %i32x2_a, align 8 + %776 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %777 = call <2 x i32> @llvm.riscv.v.smmwt.u.v2i32.v4i16(<2 x i32> %775, <4 x i16> %776) + store volatile <2 x i32> %777, <2 x i32>* %i32x2_r, align 8 + %778 = load i64, i64* %l_a, align 8 + %779 = load i32, i32* %ui_b, align 4 + %conv51 = zext i32 %779 to i64 + %780 = call i64 @llvm.riscv.sra.u.i64(i64 %778, i64 %conv51) + store volatile i64 %780, i64* %l_r, align 8 + %781 = load i64, i64* %ul_a, align 8 + %782 = load i32, i32* %ui_b, align 4 + %conv52 = zext i32 %782 to i64 + %783 = call i64 @llvm.riscv.sra8.i64(i64 %781, i64 %conv52) + store volatile i64 %783, i64* %ul_r, align 8 + %784 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %785 = load i32, i32* %ui_b, align 4 + %conv53 = zext i32 %785 to i64 + %786 = call <8 x i8> @llvm.riscv.v.sra8.v8i8.i64(<8 x i8> %784, i64 %conv53) + store volatile <8 x i8> %786, <8 x i8>* %i8x8_r, align 8 + %787 = load i64, i64* %ul_a, align 8 + %788 = load i32, i32* %ui_b, align 4 + %conv54 = zext i32 %788 to i64 + %789 = call i64 @llvm.riscv.sra8.u.i64(i64 %787, i64 %conv54) + store volatile i64 %789, i64* %ul_r, align 8 + %790 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %791 = load i32, i32* %ui_b, align 4 + %conv55 = zext i32 %791 to i64 + %792 = call <8 x i8> @llvm.riscv.v.sra8.u.v8i8.i64(<8 x i8> 
%790, i64 %conv55) + store volatile <8 x i8> %792, <8 x i8>* %i8x8_r, align 8 + %793 = load i64, i64* %ul_a, align 8 + %794 = load i32, i32* %ui_b, align 4 + %conv56 = zext i32 %794 to i64 + %795 = call i64 @llvm.riscv.sra16.i64(i64 %793, i64 %conv56) + store volatile i64 %795, i64* %ul_r, align 8 + %796 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %797 = load i32, i32* %ui_b, align 4 + %conv57 = zext i32 %797 to i64 + %798 = call <4 x i16> @llvm.riscv.v.sra16.v4i16.i64(<4 x i16> %796, i64 %conv57) + store volatile <4 x i16> %798, <4 x i16>* %i16x4_r, align 8 + %799 = load i64, i64* %ul_a, align 8 + %800 = load i32, i32* %ui_b, align 4 + %conv58 = zext i32 %800 to i64 + %801 = call i64 @llvm.riscv.sra16.u.i64(i64 %799, i64 %conv58) + store volatile i64 %801, i64* %ul_r, align 8 + %802 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %803 = load i32, i32* %ui_b, align 4 + %conv59 = zext i32 %803 to i64 + %804 = call <4 x i16> @llvm.riscv.v.sra16.u.v4i16.i64(<4 x i16> %802, i64 %conv59) + store volatile <4 x i16> %804, <4 x i16>* %i16x4_r, align 8 + %805 = load i64, i64* %ul_a, align 8 + %806 = load i32, i32* %ui_b, align 4 + %conv60 = zext i32 %806 to i64 + %807 = call i64 @llvm.riscv.srl8.i64(i64 %805, i64 %conv60) + store volatile i64 %807, i64* %ul_r, align 8 + %808 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %809 = load i32, i32* %ui_b, align 4 + %conv61 = zext i32 %809 to i64 + %810 = call <8 x i8> @llvm.riscv.v.srl8.v8i8.i64(<8 x i8> %808, i64 %conv61) + store volatile <8 x i8> %810, <8 x i8>* %u8x8_r, align 8 + %811 = load i64, i64* %ul_a, align 8 + %812 = load i32, i32* %ui_b, align 4 + %conv62 = zext i32 %812 to i64 + %813 = call i64 @llvm.riscv.srl8.u.i64(i64 %811, i64 %conv62) + store volatile i64 %813, i64* %ul_r, align 8 + %814 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %815 = load i32, i32* %ui_b, align 4 + %conv63 = zext i32 %815 to i64 + %816 = call <8 x i8> @llvm.riscv.v.srl8.u.v8i8.i64(<8 x i8> %814, i64 %conv63) + store volatile <8 x i8> %816, <8 x i8>* %u8x8_r, align 8 + %817 = load i64, i64* %ul_a, align 8 + %818 = load i32, i32* %ui_b, align 4 + %conv64 = zext i32 %818 to i64 + %819 = call i64 @llvm.riscv.srl16.i64(i64 %817, i64 %conv64) + store volatile i64 %819, i64* %ul_r, align 8 + %820 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %821 = load i32, i32* %ui_b, align 4 + %conv65 = zext i32 %821 to i64 + %822 = call <4 x i16> @llvm.riscv.v.srl16.v4i16.i64(<4 x i16> %820, i64 %conv65) + store volatile <4 x i16> %822, <4 x i16>* %u16x4_r, align 8 + %823 = load i64, i64* %ul_a, align 8 + %824 = load i32, i32* %ui_b, align 4 + %conv66 = zext i32 %824 to i64 + %825 = call i64 @llvm.riscv.srl16.u.i64(i64 %823, i64 %conv66) + store volatile i64 %825, i64* %ul_r, align 8 + %826 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %827 = load i32, i32* %ui_b, align 4 + %conv67 = zext i32 %827 to i64 + %828 = call <4 x i16> @llvm.riscv.v.srl16.u.v4i16.i64(<4 x i16> %826, i64 %conv67) + store volatile <4 x i16> %828, <4 x i16>* %u16x4_r, align 8 + %829 = load i64, i64* %ul_a, align 8 + %830 = load i64, i64* %ul_b, align 8 + %831 = call i64 @llvm.riscv.stas16.i64(i64 %829, i64 %830) + store volatile i64 %831, i64* %ul_r, align 8 + %832 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %833 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %834 = call <4 x i16> @llvm.riscv.v.stas16.v4i16(<4 x i16> %832, <4 x i16> %833) + store volatile <4 x i16> %834, <4 x i16>* %u16x4_r, align 8 + %835 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %836 = load <4 x i16>, <4 x i16>* 
%i16x4_b, align 8 + %837 = call <4 x i16> @llvm.riscv.v.stas16.v4i16(<4 x i16> %835, <4 x i16> %836) + store volatile <4 x i16> %837, <4 x i16>* %i16x4_r, align 8 + %838 = load i64, i64* %ul_a, align 8 + %839 = load i64, i64* %ul_b, align 8 + %840 = call i64 @llvm.riscv.stsa16.i64(i64 %838, i64 %839) + store volatile i64 %840, i64* %ul_r, align 8 + %841 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %842 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %843 = call <4 x i16> @llvm.riscv.v.stsa16.v4i16(<4 x i16> %841, <4 x i16> %842) + store volatile <4 x i16> %843, <4 x i16>* %u16x4_r, align 8 + %844 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %845 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %846 = call <4 x i16> @llvm.riscv.v.stsa16.v4i16(<4 x i16> %844, <4 x i16> %845) + store volatile <4 x i16> %846, <4 x i16>* %i16x4_r, align 8 + %847 = load i64, i64* %ul_a, align 8 + %848 = load i64, i64* %ul_b, align 8 + %849 = call i64 @llvm.riscv.sub8.i64(i64 %847, i64 %848) + store volatile i64 %849, i64* %ul_r, align 8 + %850 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %851 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %852 = call <8 x i8> @llvm.riscv.v.sub8.v8i8(<8 x i8> %850, <8 x i8> %851) + store volatile <8 x i8> %852, <8 x i8>* %u8x8_r, align 8 + %853 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %854 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %855 = call <8 x i8> @llvm.riscv.v.sub8.v8i8(<8 x i8> %853, <8 x i8> %854) + store volatile <8 x i8> %855, <8 x i8>* %i8x8_r, align 8 + %856 = load i64, i64* %ul_a, align 8 + %857 = load i64, i64* %ul_b, align 8 + %858 = call i64 @llvm.riscv.sub16.i64(i64 %856, i64 %857) + store volatile i64 %858, i64* %ul_r, align 8 + %859 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %860 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %861 = call <4 x i16> @llvm.riscv.v.sub16.v4i16(<4 x i16> %859, <4 x i16> %860) + store volatile <4 x i16> %861, <4 x i16>* %u16x4_r, align 8 + %862 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %863 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %864 = call <4 x i16> @llvm.riscv.v.sub16.v4i16(<4 x i16> %862, <4 x i16> %863) + store volatile <4 x i16> %864, <4 x i16>* %i16x4_r, align 8 + %865 = load i64, i64* %ul_a, align 8 + %866 = call i64 @llvm.riscv.sunpkd810.i64(i64 %865) + store volatile i64 %866, i64* %ul_r, align 8 + %867 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %868 = call <4 x i16> @llvm.riscv.v.sunpkd810.v4i16(<8 x i8> %867) + store volatile <4 x i16> %868, <4 x i16>* %i16x4_r, align 8 + %869 = load i64, i64* %ul_a, align 8 + %870 = call i64 @llvm.riscv.sunpkd820.i64(i64 %869) + store volatile i64 %870, i64* %ul_r, align 8 + %871 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %872 = call <4 x i16> @llvm.riscv.v.sunpkd820.v4i16(<8 x i8> %871) + store volatile <4 x i16> %872, <4 x i16>* %i16x4_r, align 8 + %873 = load i64, i64* %ul_a, align 8 + %874 = call i64 @llvm.riscv.sunpkd830.i64(i64 %873) + store volatile i64 %874, i64* %ul_r, align 8 + %875 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %876 = call <4 x i16> @llvm.riscv.v.sunpkd830.v4i16(<8 x i8> %875) + store volatile <4 x i16> %876, <4 x i16>* %i16x4_r, align 8 + %877 = load i64, i64* %ul_a, align 8 + %878 = call i64 @llvm.riscv.sunpkd831.i64(i64 %877) + store volatile i64 %878, i64* %ul_r, align 8 + %879 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %880 = call <4 x i16> @llvm.riscv.v.sunpkd831.v4i16(<8 x i8> %879) + store volatile <4 x i16> %880, <4 x i16>* %i16x4_r, align 8 + %881 = load i64, i64* %ul_a, align 8 + %882 = call i64 
@llvm.riscv.sunpkd832.i64(i64 %881) + store volatile i64 %882, i64* %ul_r, align 8 + %883 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %884 = call <4 x i16> @llvm.riscv.v.sunpkd832.v4i16(<8 x i8> %883) + store volatile <4 x i16> %884, <4 x i16>* %i16x4_r, align 8 + %885 = load i64, i64* %ul_a, align 8 + %886 = call i64 @llvm.riscv.swap8.i64(i64 %885) + store volatile i64 %886, i64* %ul_r, align 8 + %887 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %888 = call <8 x i8> @llvm.riscv.v.swap8.v8i8(<8 x i8> %887) + store volatile <8 x i8> %888, <8 x i8>* %u8x8_r, align 8 + %889 = load i64, i64* %ul_a, align 8 + %890 = call i64 @llvm.riscv.swap16.i64(i64 %889) + store volatile i64 %890, i64* %ul_r, align 8 + %891 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %892 = call <4 x i16> @llvm.riscv.v.swap16.v4i16(<4 x i16> %891) + store volatile <4 x i16> %892, <4 x i16>* %u16x4_r, align 8 + %893 = load i64, i64* %ul_a, align 8 + %894 = call i64 @llvm.riscv.uclip8.i64(i64 %893, i64 7) + store volatile i64 %894, i64* %ul_r, align 8 + %895 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %896 = call <8 x i8> @llvm.riscv.v.uclip8.v8i8.i64(<8 x i8> %895, i64 7) + store volatile <8 x i8> %896, <8 x i8>* %u8x8_r, align 8 + %897 = load i64, i64* %ul_a, align 8 + %898 = call i64 @llvm.riscv.uclip16.i64(i64 %897, i64 8) + store volatile i64 %898, i64* %ul_r, align 8 + %899 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %900 = call <4 x i16> @llvm.riscv.v.uclip16.v4i16.i64(<4 x i16> %899, i64 8) + store volatile <4 x i16> %900, <4 x i16>* %u16x4_r, align 8 + %901 = load i64, i64* %l_a, align 8 + %902 = call i64 @llvm.riscv.uclip32.i64(i64 %901, i64 9) + store volatile i64 %902, i64* %l_r, align 8 + %903 = load <2 x i32>, <2 x i32>* %u32x2_a, align 8 + %904 = call <2 x i32> @llvm.riscv.v.uclip32.v2i32.i64(<2 x i32> %903, i64 9) + store volatile <2 x i32> %904, <2 x i32>* %u32x2_r, align 8 + %905 = load i64, i64* %ul_a, align 8 + %906 = load i64, i64* %ul_b, align 8 + %907 = call i64 @llvm.riscv.ucmple8.i64(i64 %905, i64 %906) + store volatile i64 %907, i64* %ul_r, align 8 + %908 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %909 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %910 = call <8 x i8> @llvm.riscv.v.ucmple8.v8i8(<8 x i8> %908, <8 x i8> %909) + store volatile <8 x i8> %910, <8 x i8>* %u8x8_r, align 8 + %911 = load i64, i64* %ul_a, align 8 + %912 = load i64, i64* %ul_b, align 8 + %913 = call i64 @llvm.riscv.ucmple16.i64(i64 %911, i64 %912) + store volatile i64 %913, i64* %ul_r, align 8 + %914 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %915 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %916 = call <4 x i16> @llvm.riscv.v.ucmple16.v4i16(<4 x i16> %914, <4 x i16> %915) + store volatile <4 x i16> %916, <4 x i16>* %u16x4_r, align 8 + %917 = load i64, i64* %ul_a, align 8 + %918 = load i64, i64* %ul_b, align 8 + %919 = call i64 @llvm.riscv.ucmplt8.i64(i64 %917, i64 %918) + store volatile i64 %919, i64* %ul_r, align 8 + %920 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %921 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %922 = call <8 x i8> @llvm.riscv.v.ucmplt8.v8i8(<8 x i8> %920, <8 x i8> %921) + store volatile <8 x i8> %922, <8 x i8>* %u8x8_r, align 8 + %923 = load i64, i64* %ul_a, align 8 + %924 = load i64, i64* %ul_b, align 8 + %925 = call i64 @llvm.riscv.ucmplt16.i64(i64 %923, i64 %924) + store volatile i64 %925, i64* %ul_r, align 8 + %926 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %927 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %928 = call <4 x i16> @llvm.riscv.v.ucmplt16.v4i16(<4 x 
i16> %926, <4 x i16> %927) + store volatile <4 x i16> %928, <4 x i16>* %u16x4_r, align 8 + %929 = load i64, i64* %ul_a, align 8 + %930 = load i64, i64* %ul_b, align 8 + %931 = call i64 @llvm.riscv.ukadd8.i64(i64 %929, i64 %930) + store volatile i64 %931, i64* %ul_r, align 8 + %932 = load <8 x i8>, <8 x i8>* %i8x8_a, align 8 + %933 = load <8 x i8>, <8 x i8>* %i8x8_b, align 8 + %934 = call <8 x i8> @llvm.riscv.v.ukadd8.v8i8(<8 x i8> %932, <8 x i8> %933) + store volatile <8 x i8> %934, <8 x i8>* %i8x8_r, align 8 + %935 = load i64, i64* %ul_a, align 8 + %936 = load i64, i64* %ul_b, align 8 + %937 = call i64 @llvm.riscv.ukadd16.i64(i64 %935, i64 %936) + store volatile i64 %937, i64* %ul_r, align 8 + %938 = load <4 x i16>, <4 x i16>* %i16x4_a, align 8 + %939 = load <4 x i16>, <4 x i16>* %i16x4_b, align 8 + %940 = call <4 x i16> @llvm.riscv.v.ukadd16.v4i16(<4 x i16> %938, <4 x i16> %939) + store volatile <4 x i16> %940, <4 x i16>* %i16x4_r, align 8 + %941 = load i32, i32* %ui_a, align 4 + %conv68 = zext i32 %941 to i64 + %942 = load i32, i32* %ui_b, align 4 + %conv69 = zext i32 %942 to i64 + %943 = call i64 @llvm.riscv.ukaddh.i64(i64 %conv68, i64 %conv69) + store volatile i64 %943, i64* %ul_r, align 8 + %944 = load i32, i32* %ui_a, align 4 + %conv70 = zext i32 %944 to i64 + %945 = load i32, i32* %ui_b, align 4 + %conv71 = zext i32 %945 to i64 + %946 = call i64 @llvm.riscv.ukaddw.i64(i64 %conv70, i64 %conv71) + store volatile i64 %946, i64* %ul_r, align 8 + %947 = load i64, i64* %ul_a, align 8 + %948 = load i64, i64* %ul_b, align 8 + %949 = call i64 @llvm.riscv.ukcras16.i64(i64 %947, i64 %948) + store volatile i64 %949, i64* %ul_r, align 8 + %950 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %951 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %952 = call <4 x i16> @llvm.riscv.v.ukcras16.v4i16(<4 x i16> %950, <4 x i16> %951) + store volatile <4 x i16> %952, <4 x i16>* %u16x4_r, align 8 + %953 = load i64, i64* %ul_a, align 8 + %954 = load i64, i64* %ul_b, align 8 + %955 = call i64 @llvm.riscv.ukcrsa16.i64(i64 %953, i64 %954) + store volatile i64 %955, i64* %ul_r, align 8 + %956 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %957 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %958 = call <4 x i16> @llvm.riscv.v.ukcrsa16.v4i16(<4 x i16> %956, <4 x i16> %957) + store volatile <4 x i16> %958, <4 x i16>* %u16x4_r, align 8 + %959 = load i64, i64* %ul_a, align 8 + %960 = load i64, i64* %ul_b, align 8 + %961 = call i64 @llvm.riscv.ukstas16.i64(i64 %959, i64 %960) + store volatile i64 %961, i64* %ul_r, align 8 + %962 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %963 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %964 = call <4 x i16> @llvm.riscv.v.ukstas16.v4i16(<4 x i16> %962, <4 x i16> %963) + store volatile <4 x i16> %964, <4 x i16>* %u16x4_r, align 8 + %965 = load i64, i64* %ul_a, align 8 + %966 = load i64, i64* %ul_b, align 8 + %967 = call i64 @llvm.riscv.ukstsa16.i64(i64 %965, i64 %966) + store volatile i64 %967, i64* %ul_r, align 8 + %968 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %969 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %970 = call <4 x i16> @llvm.riscv.v.ukstsa16.v4i16(<4 x i16> %968, <4 x i16> %969) + store volatile <4 x i16> %970, <4 x i16>* %u16x4_r, align 8 + %971 = load i64, i64* %ul_a, align 8 + %972 = load i64, i64* %ul_b, align 8 + %973 = call i64 @llvm.riscv.uksub8.i64(i64 %971, i64 %972) + store volatile i64 %973, i64* %ul_r, align 8 + %974 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %975 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %976 = call <8 x i8> 
@llvm.riscv.v.uksub8.v8i8(<8 x i8> %974, <8 x i8> %975) + store volatile <8 x i8> %976, <8 x i8>* %u8x8_r, align 8 + %977 = load i64, i64* %ul_a, align 8 + %978 = load i64, i64* %ul_b, align 8 + %979 = call i64 @llvm.riscv.uksub16.i64(i64 %977, i64 %978) + store volatile i64 %979, i64* %ul_r, align 8 + %980 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %981 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %982 = call <4 x i16> @llvm.riscv.v.uksub16.v4i16(<4 x i16> %980, <4 x i16> %981) + store volatile <4 x i16> %982, <4 x i16>* %u16x4_r, align 8 + %983 = load i32, i32* %ui_a, align 4 + %conv72 = zext i32 %983 to i64 + %984 = load i32, i32* %ui_b, align 4 + %conv73 = zext i32 %984 to i64 + %985 = call i64 @llvm.riscv.uksubh.i64(i64 %conv72, i64 %conv73) + store volatile i64 %985, i64* %ul_r, align 8 + %986 = load i32, i32* %ui_a, align 4 + %conv74 = zext i32 %986 to i64 + %987 = load i32, i32* %ui_b, align 4 + %conv75 = zext i32 %987 to i64 + %988 = call i64 @llvm.riscv.uksubw.i64(i64 %conv74, i64 %conv75) + store volatile i64 %988, i64* %ul_r, align 8 + %989 = load i64, i64* %ul_t, align 8 + %990 = load i64, i64* %ul_a, align 8 + %991 = load i64, i64* %ul_b, align 8 + %992 = call i64 @llvm.riscv.umaqa.i64(i64 %989, i64 %990, i64 %991) + store volatile i64 %992, i64* %ul_r, align 8 + %993 = load <2 x i32>, <2 x i32>* %u32x2_t, align 8 + %994 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %995 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %996 = call <2 x i32> @llvm.riscv.v.umaqa.v2i32.v8i8(<2 x i32> %993, <8 x i8> %994, <8 x i8> %995) + store volatile <2 x i32> %996, <2 x i32>* %u32x2_r, align 8 + %997 = load i64, i64* %ul_a, align 8 + %998 = load i64, i64* %ul_b, align 8 + %999 = call i64 @llvm.riscv.umax8.i64(i64 %997, i64 %998) + store volatile i64 %999, i64* %ul_r, align 8 + %1000 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %1001 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %1002 = call <8 x i8> @llvm.riscv.v.umax8.v8i8(<8 x i8> %1000, <8 x i8> %1001) + store volatile <8 x i8> %1002, <8 x i8>* %u8x8_r, align 8 + %1003 = load i64, i64* %ul_a, align 8 + %1004 = load i64, i64* %ul_b, align 8 + %1005 = call i64 @llvm.riscv.umax16.i64(i64 %1003, i64 %1004) + store volatile i64 %1005, i64* %ul_r, align 8 + %1006 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %1007 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %1008 = call <4 x i16> @llvm.riscv.v.umax16.v4i16(<4 x i16> %1006, <4 x i16> %1007) + store volatile <4 x i16> %1008, <4 x i16>* %u16x4_r, align 8 + %1009 = load i64, i64* %ul_a, align 8 + %1010 = load i64, i64* %ul_b, align 8 + %1011 = call i64 @llvm.riscv.umin8.i64(i64 %1009, i64 %1010) + store volatile i64 %1011, i64* %ul_r, align 8 + %1012 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %1013 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %1014 = call <8 x i8> @llvm.riscv.v.umin8.v8i8(<8 x i8> %1012, <8 x i8> %1013) + store volatile <8 x i8> %1014, <8 x i8>* %u8x8_r, align 8 + %1015 = load i64, i64* %ul_a, align 8 + %1016 = load i64, i64* %ul_b, align 8 + %1017 = call i64 @llvm.riscv.umin16.i64(i64 %1015, i64 %1016) + store volatile i64 %1017, i64* %ul_r, align 8 + %1018 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %1019 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %1020 = call <4 x i16> @llvm.riscv.v.umin16.v4i16(<4 x i16> %1018, <4 x i16> %1019) + store volatile <4 x i16> %1020, <4 x i16>* %u16x4_r, align 8 + %1021 = load i64, i64* %ul_a, align 8 + %1022 = load i64, i64* %ul_b, align 8 + %1023 = call i64 @llvm.riscv.uradd8.i64(i64 %1021, i64 %1022) + store volatile 
i64 %1023, i64* %ul_r, align 8 + %1024 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %1025 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %1026 = call <8 x i8> @llvm.riscv.v.uradd8.v8i8(<8 x i8> %1024, <8 x i8> %1025) + store volatile <8 x i8> %1026, <8 x i8>* %u8x8_r, align 8 + %1027 = load i64, i64* %ul_a, align 8 + %1028 = load i64, i64* %ul_b, align 8 + %1029 = call i64 @llvm.riscv.uradd16.i64(i64 %1027, i64 %1028) + store volatile i64 %1029, i64* %ul_r, align 8 + %1030 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %1031 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %1032 = call <4 x i16> @llvm.riscv.v.uradd16.v4i16(<4 x i16> %1030, <4 x i16> %1031) + store volatile <4 x i16> %1032, <4 x i16>* %u16x4_r, align 8 + %1033 = load i32, i32* %ui_a, align 4 + %conv76 = zext i32 %1033 to i64 + %1034 = load i32, i32* %ui_b, align 4 + %conv77 = zext i32 %1034 to i64 + %1035 = call i64 @llvm.riscv.uraddw.i64(i64 %conv76, i64 %conv77) + store volatile i64 %1035, i64* %ul_r, align 8 + %1036 = load i64, i64* %ul_a, align 8 + %1037 = load i64, i64* %ul_b, align 8 + %1038 = call i64 @llvm.riscv.urcras16.i64(i64 %1036, i64 %1037) + store volatile i64 %1038, i64* %ul_r, align 8 + %1039 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %1040 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %1041 = call <4 x i16> @llvm.riscv.v.urcras16.v4i16(<4 x i16> %1039, <4 x i16> %1040) + store volatile <4 x i16> %1041, <4 x i16>* %u16x4_r, align 8 + %1042 = load i64, i64* %ul_a, align 8 + %1043 = load i64, i64* %ul_b, align 8 + %1044 = call i64 @llvm.riscv.urcrsa16.i64(i64 %1042, i64 %1043) + store volatile i64 %1044, i64* %ul_r, align 8 + %1045 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %1046 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %1047 = call <4 x i16> @llvm.riscv.v.urcrsa16.v4i16(<4 x i16> %1045, <4 x i16> %1046) + store volatile <4 x i16> %1047, <4 x i16>* %u16x4_r, align 8 + %1048 = load i64, i64* %ul_a, align 8 + %1049 = load i64, i64* %ul_b, align 8 + %1050 = call i64 @llvm.riscv.urstas16.i64(i64 %1048, i64 %1049) + store volatile i64 %1050, i64* %ul_r, align 8 + %1051 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %1052 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %1053 = call <4 x i16> @llvm.riscv.v.urstas16.v4i16(<4 x i16> %1051, <4 x i16> %1052) + store volatile <4 x i16> %1053, <4 x i16>* %u16x4_r, align 8 + %1054 = load i64, i64* %ul_a, align 8 + %1055 = load i64, i64* %ul_b, align 8 + %1056 = call i64 @llvm.riscv.urstsa16.i64(i64 %1054, i64 %1055) + store volatile i64 %1056, i64* %ul_r, align 8 + %1057 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %1058 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 + %1059 = call <4 x i16> @llvm.riscv.v.urstsa16.v4i16(<4 x i16> %1057, <4 x i16> %1058) + store volatile <4 x i16> %1059, <4 x i16>* %u16x4_r, align 8 + %1060 = load i64, i64* %ul_a, align 8 + %1061 = load i64, i64* %ul_b, align 8 + %1062 = call i64 @llvm.riscv.ursub8.i64(i64 %1060, i64 %1061) + store volatile i64 %1062, i64* %ul_r, align 8 + %1063 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %1064 = load <8 x i8>, <8 x i8>* %u8x8_b, align 8 + %1065 = call <8 x i8> @llvm.riscv.v.ursub8.v8i8(<8 x i8> %1063, <8 x i8> %1064) + store volatile <8 x i8> %1065, <8 x i8>* %u8x8_r, align 8 + %1066 = load i64, i64* %ul_a, align 8 + %1067 = load i64, i64* %ul_b, align 8 + %1068 = call i64 @llvm.riscv.ursub16.i64(i64 %1066, i64 %1067) + store volatile i64 %1068, i64* %ul_r, align 8 + %1069 = load <4 x i16>, <4 x i16>* %u16x4_a, align 8 + %1070 = load <4 x i16>, <4 x i16>* %u16x4_b, align 8 
+ %1071 = call <4 x i16> @llvm.riscv.v.ursub16.v4i16(<4 x i16> %1069, <4 x i16> %1070) + store volatile <4 x i16> %1071, <4 x i16>* %u16x4_r, align 8 + %1072 = load i32, i32* %ui_a, align 4 + %conv78 = zext i32 %1072 to i64 + %1073 = load i32, i32* %ui_b, align 4 + %conv79 = zext i32 %1073 to i64 + %1074 = call i64 @llvm.riscv.ursubw.i64(i64 %conv78, i64 %conv79) + store volatile i64 %1074, i64* %ul_r, align 8 + %1075 = load i64, i64* %ul_a, align 8 + %1076 = call i64 @llvm.riscv.zunpkd810.i64(i64 %1075) + store volatile i64 %1076, i64* %ul_r, align 8 + %1077 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %1078 = call <4 x i16> @llvm.riscv.v.zunpkd810.v4i16(<8 x i8> %1077) + store volatile <4 x i16> %1078, <4 x i16>* %u16x4_r, align 8 + %1079 = load i64, i64* %ul_a, align 8 + %1080 = call i64 @llvm.riscv.zunpkd820.i64(i64 %1079) + store volatile i64 %1080, i64* %ul_r, align 8 + %1081 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %1082 = call <4 x i16> @llvm.riscv.v.zunpkd820.v4i16(<8 x i8> %1081) + store volatile <4 x i16> %1082, <4 x i16>* %u16x4_r, align 8 + %1083 = load i64, i64* %ul_a, align 8 + %1084 = call i64 @llvm.riscv.zunpkd830.i64(i64 %1083) + store volatile i64 %1084, i64* %ul_r, align 8 + %1085 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %1086 = call <4 x i16> @llvm.riscv.v.zunpkd830.v4i16(<8 x i8> %1085) + store volatile <4 x i16> %1086, <4 x i16>* %u16x4_r, align 8 + %1087 = load i64, i64* %ul_a, align 8 + %1088 = call i64 @llvm.riscv.zunpkd831.i64(i64 %1087) + store volatile i64 %1088, i64* %ul_r, align 8 + %1089 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %1090 = call <4 x i16> @llvm.riscv.v.zunpkd831.v4i16(<8 x i8> %1089) + store volatile <4 x i16> %1090, <4 x i16>* %u16x4_r, align 8 + %1091 = load i64, i64* %ul_a, align 8 + %1092 = call i64 @llvm.riscv.zunpkd832.i64(i64 %1091) + store volatile i64 %1092, i64* %ul_r, align 8 + %1093 = load <8 x i8>, <8 x i8>* %u8x8_a, align 8 + %1094 = call <4 x i16> @llvm.riscv.v.zunpkd832.v4i16(<8 x i8> %1093) + store volatile <4 x i16> %1094, <4 x i16>* %u16x4_r, align 8 + ret void +} + +declare i64 @llvm.riscv.add8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.add8.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.add16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.add16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.ave.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.bitrev.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.bpick.i64(i64, i64, i64) nounwind + +declare i64 @llvm.riscv.clrs8.i64(i64) nounwind + +declare <8 x i8> @llvm.riscv.v.clrs8.v8i8(<8 x i8>) nounwind + +declare i64 @llvm.riscv.clrs16.i64(i64) nounwind + +declare <4 x i16> @llvm.riscv.v.clrs16.v4i16(<4 x i16>) nounwind + +declare i64 @llvm.riscv.clrs32.i64(i64) nounwind + +declare <2 x i32> @llvm.riscv.v.clrs32.v2i32(<2 x i32>) nounwind + +declare i64 @llvm.riscv.clo8.i64(i64) nounwind + +declare <8 x i8> @llvm.riscv.v.clo8.v8i8(<8 x i8>) nounwind + +declare i64 @llvm.riscv.clo16.i64(i64) nounwind + +declare <4 x i16> @llvm.riscv.v.clo16.v4i16(<4 x i16>) nounwind + +declare i64 @llvm.riscv.clo32.i64(i64) nounwind + +declare <2 x i32> @llvm.riscv.v.clo32.v2i32(<2 x i32>) nounwind + +declare i64 @llvm.riscv.clz8.i64(i64) nounwind + +declare <8 x i8> @llvm.riscv.v.clz8.v8i8(<8 x i8>) nounwind + +declare i64 @llvm.riscv.clz16.i64(i64) nounwind + +declare <4 x i16> @llvm.riscv.v.clz16.v4i16(<4 x i16>) nounwind + +declare i64 @llvm.riscv.clz32.i64(i64) nounwind + +declare <2 x i32> @llvm.riscv.v.clz32.v2i32(<2 x 
i32>) nounwind + +declare i64 @llvm.riscv.cmpeq8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.cmpeq8.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.cmpeq16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.cmpeq16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.cras16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.cras16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.crsa16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.crsa16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.insb.i64(i64, i64, i64) nounwind + +declare i64 @llvm.riscv.kabs8.i64(i64) nounwind + +declare <8 x i8> @llvm.riscv.v.kabs8.v8i8(<8 x i8>) nounwind + +declare i64 @llvm.riscv.kabs16.i64(i64) nounwind + +declare <4 x i16> @llvm.riscv.v.kabs16.v4i16(<4 x i16>) nounwind + +declare i64 @llvm.riscv.kabsw.i64(i64) nounwind + +declare i64 @llvm.riscv.kadd8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.kadd8.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.kadd16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.kadd16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kaddh.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.kaddw.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.kcras16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.kcras16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kcrsa16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.kcrsa16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kdmbb.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.v.kdmbb.i64.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kdmbt.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.v.kdmbt.i64.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kdmtt.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.v.kdmtt.i64.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kdmabb.i64(i64, i64, i64) nounwind + +declare i64 @llvm.riscv.v.kdmabb.i64.v4i16(i64, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kdmabt.i64(i64, i64, i64) nounwind + +declare i64 @llvm.riscv.v.kdmabt.i64.v4i16(i64, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kdmatt.i64(i64, i64, i64) nounwind + +declare i64 @llvm.riscv.v.kdmatt.i64.v4i16(i64, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.khm8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.khm8.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.khmx8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.khmx8.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.khm16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.khm16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.khmx16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.khmx16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.khmbb.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.v.khmbb.i64.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.khmbt.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.v.khmbt.i64.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.khmtt.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.v.khmtt.i64.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmabb.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmabb.v2i32.v4i16(<2 x i32>, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmabt.i64(i64, i64, i64) nounwind + +declare <2 x i32> 
@llvm.riscv.v.kmabt.v2i32.v4i16(<2 x i32>, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmatt.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmatt.v2i32.v4i16(<2 x i32>, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmada.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmada.v2i32.v4i16(<2 x i32>, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmaxda.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmaxda.v2i32.v4i16(<2 x i32>, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmads.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmads.v2i32.v4i16(<2 x i32>, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmadrs.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmadrs.v2i32.v4i16(<2 x i32>, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmaxds.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmaxds.v2i32.v4i16(<2 x i32>, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmda.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmda.v2i32.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmxda.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmxda.v2i32.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmac.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmac.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind + +declare i64 @llvm.riscv.kmmac.u.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmac.u.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind + +declare i64 @llvm.riscv.kmmawb.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmawb.v2i32.v4i16(<2 x i32>, <2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmawb.u.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmawb.u.v2i32.v4i16(<2 x i32>, <2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmawb2.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmawb2.v2i32.v4i16(<2 x i32>, <2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmawb2.u.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmawb2.u.v2i32.v4i16(<2 x i32>, <2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmawt.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmawt.v2i32.v4i16(<2 x i32>, <2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmawt.u.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmawt.u.v2i32.v4i16(<2 x i32>, <2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmawt2.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmawt2.v2i32.v4i16(<2 x i32>, <2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmawt2.u.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmawt2.u.v2i32.v4i16(<2 x i32>, <2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmsb.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmsb.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind + +declare i64 @llvm.riscv.kmmsb.u.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmsb.u.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind + +declare i64 @llvm.riscv.kmmwb2.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmwb2.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmwb2.u.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmwb2.u.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmwt2.i64(i64, i64) nounwind + +declare <2 x 
i32> @llvm.riscv.v.kmmwt2.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmmwt2.u.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmmwt2.u.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmsda.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmsda.v2i32.v4i16(<2 x i32>, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kmsxda.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kmsxda.v2i32.v4i16(<2 x i32>, <4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.ksllw.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.ksll8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.ksll8.v8i8.i64(<8 x i8>, i64) nounwind + +declare i64 @llvm.riscv.ksll16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.ksll16.v4i16.i64(<4 x i16>, i64) nounwind + +declare i64 @llvm.riscv.kslra8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.kslra8.v8i8.i64(<8 x i8>, i64) nounwind + +declare i64 @llvm.riscv.kslra8.u.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.kslra8.u.v8i8.i64(<8 x i8>, i64) nounwind + +declare i64 @llvm.riscv.kslra16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.kslra16.v4i16.i64(<4 x i16>, i64) nounwind + +declare i64 @llvm.riscv.kslra16.u.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.kslra16.u.v4i16.i64(<4 x i16>, i64) nounwind + +declare i64 @llvm.riscv.kstas16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.kstas16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.kstsa16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.kstsa16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.ksub8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.ksub8.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.ksub16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.ksub16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.ksubh.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.ksubw.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.kwmmul.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kwmmul.v2i32(<2 x i32>, <2 x i32>) nounwind + +declare i64 @llvm.riscv.kwmmul.u.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.kwmmul.u.v2i32(<2 x i32>, <2 x i32>) nounwind + +declare i64 @llvm.riscv.maxw.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.minw.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.pbsad.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.v.pbsad.i64.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.pbsada.i64(i64, i64, i64) nounwind + +declare i64 @llvm.riscv.v.pbsada.i64.v8i8(i64, <8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.pkbb16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.pkbb16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.pkbt16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.pkbt16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.pktt16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.pktt16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.pktb16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.pktb16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.radd8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.radd8.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.radd16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.radd16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.raddw.i64(i64, 
i64) nounwind + +declare i64 @llvm.riscv.rcras16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.rcras16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.rcrsa16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.rcrsa16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.rstas16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.rstas16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.rstsa16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.rstsa16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.rsub8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.rsub8.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.rsub16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.rsub16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.rsubw.i64(i64, i64) nounwind + +declare i64 @llvm.riscv.sclip8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.sclip8.v8i8.i64(<8 x i8>, i64) nounwind + +declare i64 @llvm.riscv.sclip16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.sclip16.v4i16.i64(<4 x i16>, i64) nounwind + +declare i64 @llvm.riscv.sclip32.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.sclip32.v2i32.i64(<2 x i32>, i64) nounwind + +declare i64 @llvm.riscv.scmple8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.scmple8.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.scmple16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.scmple16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.scmplt8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.scmplt8.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.scmplt16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.scmplt16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.sll8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.sll8.v8i8.i64(<8 x i8>, i64) nounwind + +declare i64 @llvm.riscv.sll16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.sll16.v4i16.i64(<4 x i16>, i64) nounwind + +declare i64 @llvm.riscv.smaqa.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.smaqa.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.smaqa.su.i64(i64, i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.smaqa.su.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.smax8.i64(i64, i64) nounwind + +declare <8 x i8> @llvm.riscv.v.smax8.v8i8(<8 x i8>, <8 x i8>) nounwind + +declare i64 @llvm.riscv.smax16.i64(i64, i64) nounwind + +declare <4 x i16> @llvm.riscv.v.smax16.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.smbb16.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.smbb16.v2i32.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.smbt16.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.smbt16.v2i32.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.smtt16.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.smtt16.v2i32.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.smds.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.smds.v2i32.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.smdrs.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.smdrs.v2i32.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 @llvm.riscv.smxds.i64(i64, i64) nounwind + +declare <2 x i32> @llvm.riscv.v.smxds.v2i32.v4i16(<4 x i16>, <4 x i16>) nounwind + +declare i64 
@llvm.riscv.smin8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.smin8.v8i8(<8 x i8>, <8 x i8>) nounwind
+
+declare i64 @llvm.riscv.smin16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.smin16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.smmul.i64(i64, i64) nounwind
+
+declare <2 x i32> @llvm.riscv.v.smmul.v2i32(<2 x i32>, <2 x i32>) nounwind
+
+declare i64 @llvm.riscv.smmul.u.i64(i64, i64) nounwind
+
+declare <2 x i32> @llvm.riscv.v.smmul.u.v2i32(<2 x i32>, <2 x i32>) nounwind
+
+declare i64 @llvm.riscv.smmwb.i64(i64, i64) nounwind
+
+declare <2 x i32> @llvm.riscv.v.smmwb.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.smmwb.u.i64(i64, i64) nounwind
+
+declare <2 x i32> @llvm.riscv.v.smmwb.u.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.smmwt.i64(i64, i64) nounwind
+
+declare <2 x i32> @llvm.riscv.v.smmwt.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.smmwt.u.i64(i64, i64) nounwind
+
+declare <2 x i32> @llvm.riscv.v.smmwt.u.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.sra.u.i64(i64, i64) nounwind
+
+declare i64 @llvm.riscv.sra8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.sra8.v8i8.i64(<8 x i8>, i64) nounwind
+
+declare i64 @llvm.riscv.sra8.u.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.sra8.u.v8i8.i64(<8 x i8>, i64) nounwind
+
+declare i64 @llvm.riscv.sra16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.sra16.v4i16.i64(<4 x i16>, i64) nounwind
+
+declare i64 @llvm.riscv.sra16.u.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.sra16.u.v4i16.i64(<4 x i16>, i64) nounwind
+
+declare i64 @llvm.riscv.srl8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.srl8.v8i8.i64(<8 x i8>, i64) nounwind
+
+declare i64 @llvm.riscv.srl8.u.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.srl8.u.v8i8.i64(<8 x i8>, i64) nounwind
+
+declare i64 @llvm.riscv.srl16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.srl16.v4i16.i64(<4 x i16>, i64) nounwind
+
+declare i64 @llvm.riscv.srl16.u.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.srl16.u.v4i16.i64(<4 x i16>, i64) nounwind
+
+declare i64 @llvm.riscv.stas16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.stas16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.stsa16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.stsa16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.sub8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.sub8.v8i8(<8 x i8>, <8 x i8>) nounwind
+
+declare i64 @llvm.riscv.sub16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.sub16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.sunpkd810.i64(i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.sunpkd810.v4i16(<8 x i8>) nounwind
+
+declare i64 @llvm.riscv.sunpkd820.i64(i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.sunpkd820.v4i16(<8 x i8>) nounwind
+
+declare i64 @llvm.riscv.sunpkd830.i64(i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.sunpkd830.v4i16(<8 x i8>) nounwind
+
+declare i64 @llvm.riscv.sunpkd831.i64(i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.sunpkd831.v4i16(<8 x i8>) nounwind
+
+declare i64 @llvm.riscv.sunpkd832.i64(i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.sunpkd832.v4i16(<8 x i8>) nounwind
+
+declare i64 @llvm.riscv.swap8.i64(i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.swap8.v8i8(<8 x i8>) nounwind
+
+declare i64 @llvm.riscv.swap16.i64(i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.swap16.v4i16(<4 x i16>) nounwind
+
+declare i64 @llvm.riscv.uclip8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.uclip8.v8i8.i64(<8 x i8>, i64) nounwind
+
+declare i64 @llvm.riscv.uclip16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.uclip16.v4i16.i64(<4 x i16>, i64) nounwind
+
+declare i64 @llvm.riscv.uclip32.i64(i64, i64) nounwind
+
+declare <2 x i32> @llvm.riscv.v.uclip32.v2i32.i64(<2 x i32>, i64) nounwind
+
+declare i64 @llvm.riscv.ucmple8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.ucmple8.v8i8(<8 x i8>, <8 x i8>) nounwind
+
+declare i64 @llvm.riscv.ucmple16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.ucmple16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.ucmplt8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.ucmplt8.v8i8(<8 x i8>, <8 x i8>) nounwind
+
+declare i64 @llvm.riscv.ucmplt16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.ucmplt16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.ukadd8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.ukadd8.v8i8(<8 x i8>, <8 x i8>) nounwind
+
+declare i64 @llvm.riscv.ukadd16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.ukadd16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.ukaddh.i64(i64, i64) nounwind
+
+declare i64 @llvm.riscv.ukaddw.i64(i64, i64) nounwind
+
+declare i64 @llvm.riscv.ukcras16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.ukcras16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.ukcrsa16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.ukcrsa16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.ukstas16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.ukstas16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.ukstsa16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.ukstsa16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.uksub8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.uksub8.v8i8(<8 x i8>, <8 x i8>) nounwind
+
+declare i64 @llvm.riscv.uksub16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.uksub16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.uksubh.i64(i64, i64) nounwind
+
+declare i64 @llvm.riscv.uksubw.i64(i64, i64) nounwind
+
+declare i64 @llvm.riscv.umaqa.i64(i64, i64, i64) nounwind
+
+declare <2 x i32> @llvm.riscv.v.umaqa.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) nounwind
+
+declare i64 @llvm.riscv.umax8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.umax8.v8i8(<8 x i8>, <8 x i8>) nounwind
+
+declare i64 @llvm.riscv.umax16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.umax16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.umin8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.umin8.v8i8(<8 x i8>, <8 x i8>) nounwind
+
+declare i64 @llvm.riscv.umin16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.umin16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.uradd8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.uradd8.v8i8(<8 x i8>, <8 x i8>) nounwind
+
+declare i64 @llvm.riscv.uradd16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.uradd16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.uraddw.i64(i64, i64) nounwind
+
+declare i64 @llvm.riscv.urcras16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.urcras16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.urcrsa16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.urcrsa16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.urstas16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.urstas16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.urstsa16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.urstsa16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.ursub8.i64(i64, i64) nounwind
+
+declare <8 x i8> @llvm.riscv.v.ursub8.v8i8(<8 x i8>, <8 x i8>) nounwind
+
+declare i64 @llvm.riscv.ursub16.i64(i64, i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.ursub16.v4i16(<4 x i16>, <4 x i16>) nounwind
+
+declare i64 @llvm.riscv.ursubw.i64(i64, i64) nounwind
+
+declare i64 @llvm.riscv.zunpkd810.i64(i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.zunpkd810.v4i16(<8 x i8>) nounwind
+
+declare i64 @llvm.riscv.zunpkd820.i64(i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.zunpkd820.v4i16(<8 x i8>) nounwind
+
+declare i64 @llvm.riscv.zunpkd830.i64(i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.zunpkd830.v4i16(<8 x i8>) nounwind
+
+declare i64 @llvm.riscv.zunpkd831.i64(i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.zunpkd831.v4i16(<8 x i8>) nounwind
+
+declare i64 @llvm.riscv.zunpkd832.i64(i64) nounwind
+
+declare <4 x i16> @llvm.riscv.v.zunpkd832.v4i16(<8 x i8>) nounwind