diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -951,6 +951,25 @@
                 ],
                 [IntrReadMem, IntrArgMemOnly]>;
 
+  class SVE2_3VectorArg_Long_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>,
+                 LLVMSubdivide2VectorType<0>,
+                 LLVMSubdivide2VectorType<0>],
+                [IntrNoMem]>;
+
+  class SVE2_3VectorArgIndexed_Long_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>,
+                 LLVMSubdivide2VectorType<0>,
+                 LLVMSubdivide2VectorType<0>,
+                 llvm_i32_ty],
+                [IntrNoMem]>;
+
+  // NOTE: There is no relationship between these intrinsics beyond an attempt
+  // to reuse currently identical class definitions.
+  class AdvSIMD_SVE_LOGB_Intrinsic : AdvSIMD_SVE_CNT_Intrinsic;
+
   // This class of intrinsics is not intended to be useful within LLVM IR but
   // is instead here to support some of the more rigid parts of the ACLE.
   class Builtin_SVCVT<string name, LLVMType OUT, LLVMType IN>
@@ -1191,4 +1210,33 @@
 
 // scalar + vector, 64 bit scaled offsets
 def int_aarch64_sve_ld1_gather_index : AdvSIMD_GatherLoad_64bitOffset_Intrinsic;
+
+//
+// SVE2 - Non-widening pairwise arithmetic
+//
+
+def int_aarch64_sve_faddp   : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fmaxp   : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fmaxnmp : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fminp   : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fminnmp : AdvSIMD_Pred2VectorArg_Intrinsic;
+
+//
+// SVE2 - Floating-point widening multiply-accumulate
+//
+
+def int_aarch64_sve_fmlalb      : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_fmlalb_lane : SVE2_3VectorArgIndexed_Long_Intrinsic;
+def int_aarch64_sve_fmlalt      : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_fmlalt_lane : SVE2_3VectorArgIndexed_Long_Intrinsic;
+def int_aarch64_sve_fmlslb      : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_fmlslb_lane : SVE2_3VectorArgIndexed_Long_Intrinsic;
+def int_aarch64_sve_fmlslt      : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_fmlslt_lane : SVE2_3VectorArgIndexed_Long_Intrinsic;
+
+//
+// SVE2 - Floating-point integer binary logarithm
+//
+
+def int_aarch64_sve_flogb : AdvSIMD_SVE_LOGB_Intrinsic;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1427,7 +1427,7 @@
   defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">;
 
   // SVE2 floating-point base 2 logarithm as integer
-  defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
+  defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>;
 
   // SVE2 floating-point convert precision
   defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding_top<"fcvtxnt", "int_aarch64_sve_fcvtxnt">;
@@ -1436,23 +1436,23 @@
   defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt", "int_aarch64_sve_fcvtlt">;
 
   // SVE2 floating-point pairwise operations
-  defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">;
-  defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp">;
-  defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp">;
-  defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp">;
-  defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp">;
+  defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp", int_aarch64_sve_faddp>;
+  defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp", int_aarch64_sve_fmaxnmp>;
+  defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp", int_aarch64_sve_fminnmp>;
+  defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp", int_aarch64_sve_fmaxp>;
+  defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp", int_aarch64_sve_fminp>;
 
   // SVE2 floating-point multiply-add long (indexed)
-  def FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb">;
-  def FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt">;
-  def FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb">;
-  def FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt">;
+  defm FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb", int_aarch64_sve_fmlalb_lane>;
+  defm FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt", int_aarch64_sve_fmlalt_lane>;
+  defm FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb", int_aarch64_sve_fmlslb_lane>;
+  defm FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt", int_aarch64_sve_fmlslt_lane>;
 
   // SVE2 floating-point multiply-add long
-  def FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb">;
-  def FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt">;
-  def FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb">;
-  def FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt">;
+  defm FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb", int_aarch64_sve_fmlalb>;
+  defm FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt", int_aarch64_sve_fmlalt>;
+  defm FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb", int_aarch64_sve_fmlslb>;
+  defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>;
 
   // SVE2 bitwise ternary operations
   defm EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -310,6 +310,12 @@
 : Pat<(vtd (op vt1:$Op1, vt2:$Op2, (vt3 ImmTy:$Op3))),
       (inst $Op1, $Op2, ImmTy:$Op3)>;
 
+class SVE_4_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                       ValueType vt2, ValueType vt3, ValueType vt4,
+                       Operand ImmTy, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))),
+      (inst $Op1, $Op2, $Op3, ImmTy:$Op4)>;
+
 def SVEDup0Undef : ComplexPattern<i64, 0, "", []>;
 
 //===----------------------------------------------------------------------===//
@@ -1695,10 +1701,14 @@
   let ElementSize = zprty.ElementSize;
 }
 
-multiclass sve2_fp_pairwise_pred<bits<3> opc, string asm> {
+multiclass sve2_fp_pairwise_pred<bits<3> opc, string asm, SDPatternOperator op> {
   def _H : sve2_fp_pairwise_pred<0b01, opc, asm, ZPR16>;
   def _S : sve2_fp_pairwise_pred<0b10, opc, asm, ZPR32>;
   def _D : sve2_fp_pairwise_pred<0b11, opc, asm, ZPR64>;
+
+  def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1707,7 +1717,7 @@
 
 class sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm>
 : I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm,
-       VectorIndexH:$iop),
+       VectorIndexH32b:$iop),
   asm, "\t$Zda, $Zn, $Zm$iop",
   "",
   []>, Sched<[]> {
@@ -1731,6 +1741,12 @@
   let ElementSize = ElementSizeNone;
 }
 
+multiclass sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm,
+                                            SDPatternOperator op> {
+  def NAME : sve2_fp_mla_long_by_indexed_elem<opc, asm>;
+  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, i32, VectorIndexH32b, !cast<Instruction>(NAME)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SVE2 Floating Point Widening Multiply-Add Group
 //===----------------------------------------------------------------------===//
@@ -1757,6 +1773,11 @@
   let ElementSize = ElementSizeNone;
 }
 
+multiclass sve2_fp_mla_long<bits<2> opc, string asm, SDPatternOperator op> {
+  def NAME : sve2_fp_mla_long<opc, asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, !cast<Instruction>(NAME)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SVE Stack Allocation Group
 //===----------------------------------------------------------------------===//
@@ -1871,10 +1892,14 @@
   def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
 }
 
-multiclass sve2_fp_flogb<string asm> {
+multiclass sve2_fp_flogb<string asm, SDPatternOperator op> {
   def _H : sve_fp_2op_p_zd<0b0011010, asm, ZPR16, ZPR16, ElementSizeH>;
   def _S : sve_fp_2op_p_zd<0b0011100, asm, ZPR32, ZPR32, ElementSizeS>;
   def _D : sve_fp_2op_p_zd<0b0011110, asm, ZPR64, ZPR64, ElementSizeD>;
+
+  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
 }
 
 multiclass sve2_fp_convert_down_odd_rounding<string asm, string op> {
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-int-binary-logarithm.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-int-binary-logarithm.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-int-binary-logarithm.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -asm-verbose=0 < %s | FileCheck %s
+
+;
+; FLOGB
+;
+
+define <vscale x 8 x i16> @flogb_f16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg, <vscale x 8 x half> %b) {
+; CHECK-LABEL: flogb_f16:
+; CHECK: flogb z0.h, p0/m, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.flogb.nxv8f16(<vscale x 8 x i16> %a,
+                                                                 <vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @flogb_f32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg, <vscale x 4 x float> %b) {
+; CHECK-LABEL: flogb_f32:
+; CHECK: flogb z0.s, p0/m, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.flogb.nxv4f32(<vscale x 4 x i32> %a,
+                                                                 <vscale x 4 x i1> %pg,
+                                                                 <vscale x 4 x float> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @flogb_f64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, <vscale x 2 x double> %b) {
+; CHECK-LABEL: flogb_f64:
+; CHECK: flogb z0.d, p0/m, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.flogb.nxv2f64(<vscale x 2 x i64> %a,
+                                                                 <vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x double> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.flogb.nxv8f16(<vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x half>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.flogb.nxv4f32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x float>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.flogb.nxv2f64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-widening-mul-acc.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-widening-mul-acc.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-widening-mul-acc.ll
@@ -0,0 +1,127 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; FMLALB (Vectors)
+;
+
+define <vscale x 4 x float> @fmlalb_h(<vscale x 4 x float> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: fmlalb_h:
+; CHECK: fmlalb z0.s, z1.h, z2.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmlalb.nxv4f32(<vscale x 4 x float> %a,
+                                                                    <vscale x 8 x half> %b,
+                                                                    <vscale x 8 x half> %c)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; FMLALB (Indexed)
+;
+
+define <vscale x 4 x float> @fmlalb_lane_h(<vscale x 4 x float> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: fmlalb_lane_h:
+; CHECK: fmlalb z0.s, z1.h, z2.h[0]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmlalb.lane.nxv4f32(<vscale x 4 x float> %a,
+                                                                         <vscale x 8 x half> %b,
+                                                                         <vscale x 8 x half> %c,
+                                                                         i32 0)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; FMLALT (Vectors)
+;
+
+define <vscale x 4 x float> @fmlalt_h(<vscale x 4 x float> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: fmlalt_h:
+; CHECK: fmlalt z0.s, z1.h, z2.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmlalt.nxv4f32(<vscale x 4 x float> %a,
+                                                                    <vscale x 8 x half> %b,
+                                                                    <vscale x 8 x half> %c)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; FMLALT (Indexed)
+;
+
+define <vscale x 4 x float> @fmlalt_lane_h(<vscale x 4 x float> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: fmlalt_lane_h:
+; CHECK: fmlalt z0.s, z1.h, z2.h[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmlalt.lane.nxv4f32(<vscale x 4 x float> %a,
+                                                                         <vscale x 8 x half> %b,
+                                                                         <vscale x 8 x half> %c,
+                                                                         i32 1)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; FMLSLB (Vectors)
+;
+
+define <vscale x 4 x float> @fmlslb_h(<vscale x 4 x float> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: fmlslb_h:
+; CHECK: fmlslb z0.s, z1.h, z2.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmlslb.nxv4f32(<vscale x 4 x float> %a,
+                                                                    <vscale x 8 x half> %b,
+                                                                    <vscale x 8 x half> %c)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; FMLSLB (Indexed)
+;
+
+define <vscale x 4 x float> @fmlslb_lane_h(<vscale x 4 x float> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: fmlslb_lane_h:
+; CHECK: fmlslb z0.s, z1.h, z2.h[2]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmlslb.lane.nxv4f32(<vscale x 4 x float> %a,
+                                                                         <vscale x 8 x half> %b,
+                                                                         <vscale x 8 x half> %c,
+                                                                         i32 2)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; FMLSLT (Vectors)
+;
+
+define <vscale x 4 x float> @fmlslt_h(<vscale x 4 x float> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: fmlslt_h:
+; CHECK: fmlslt z0.s, z1.h, z2.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmlslt.nxv4f32(<vscale x 4 x float> %a,
+                                                                    <vscale x 8 x half> %b,
+                                                                    <vscale x 8 x half> %c)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; FMLSLT (Indexed)
+;
+
+define <vscale x 4 x float> @fmlslt_lane_h(<vscale x 4 x float> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: fmlslt_lane_h:
+; CHECK: fmlslt z0.s, z1.h, z2.h[3]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmlslt.lane.nxv4f32(<vscale x 4 x float> %a,
+                                                                         <vscale x 8 x half> %b,
+                                                                         <vscale x 8 x half> %c,
+                                                                         i32 3)
+  ret <vscale x 4 x float> %out
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmlalb.nxv4f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmlalb.lane.nxv4f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmlalt.nxv4f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmlalt.lane.nxv4f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmlslb.nxv4f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmlslb.lane.nxv4f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmlslt.nxv4f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmlslt.lane.nxv4f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-non-widening-pairwise-arith.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-non-widening-pairwise-arith.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-non-widening-pairwise-arith.ll
@@ -0,0 +1,191 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; FADDP
+;
+
+define <vscale x 8 x half> @faddp_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: faddp_f16:
+; CHECK: faddp z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.faddp.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                  <vscale x 8 x half> %a,
+                                                                  <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @faddp_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: faddp_f32:
+; CHECK: faddp z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.faddp.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x float> %a,
+                                                                   <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @faddp_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: faddp_f64:
+; CHECK: faddp z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.faddp.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                    <vscale x 2 x double> %a,
+                                                                    <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMAXP
+;
+
+define <vscale x 8 x half> @fmaxp_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fmaxp_f16:
+; CHECK: fmaxp z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmaxp.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                  <vscale x 8 x half> %a,
+                                                                  <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fmaxp_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fmaxp_f32:
+; CHECK: fmaxp z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmaxp.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x float> %a,
+                                                                   <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fmaxp_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fmaxp_f64:
+; CHECK: fmaxp z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fmaxp.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                    <vscale x 2 x double> %a,
+                                                                    <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMAXNMP
+;
+
+define <vscale x 8 x half> @fmaxnmp_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fmaxnmp_f16:
+; CHECK: fmaxnmp z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmaxnmp.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                    <vscale x 8 x half> %a,
+                                                                    <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fmaxnmp_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fmaxnmp_f32:
+; CHECK: fmaxnmp z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmaxnmp.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                     <vscale x 4 x float> %a,
+                                                                     <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fmaxnmp_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fmaxnmp_f64:
+; CHECK: fmaxnmp z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fmaxnmp.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                      <vscale x 2 x double> %a,
+                                                                      <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMINP
+;
+
+define <vscale x 8 x half> @fminp_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fminp_f16:
+; CHECK: fminp z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fminp.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                  <vscale x 8 x half> %a,
+                                                                  <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fminp_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fminp_f32:
+; CHECK: fminp z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fminp.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x float> %a,
+                                                                   <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fminp_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fminp_f64:
+; CHECK: fminp z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fminp.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                    <vscale x 2 x double> %a,
+                                                                    <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMINNMP
+;
+
+define <vscale x 8 x half> @fminnmp_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fminnmp_f16:
+; CHECK: fminnmp z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fminnmp.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                    <vscale x 8 x half> %a,
+                                                                    <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fminnmp_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fminnmp_f32:
+; CHECK: fminnmp z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fminnmp.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                     <vscale x 4 x float> %a,
+                                                                     <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fminnmp_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fminnmp_f64:
+; CHECK: fminnmp z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fminnmp.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                      <vscale x 2 x double> %a,
+                                                                      <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.faddp.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.faddp.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.faddp.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmaxp.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmaxp.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmaxp.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmaxnmp.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmaxnmp.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmaxnmp.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fminp.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fminp.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fminp.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fminnmp.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fminnmp.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fminnmp.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
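
Usage sketch (illustrative only, not part of the patch; the function and value names below are invented): the new intrinsics compose like any other LLVM IR calls, and chaining a widening multiply-add into a pairwise add exercises both new selection patterns at once. Compiled with llc -mtriple=aarch64-linux-gnu -mattr=+sve2, this should select an fmlalb followed by a faddp:

; Hypothetical composition example; types match the declarations above.
define <vscale x 4 x float> @fmlalb_then_faddp(<vscale x 4 x i1> %pg,
                                               <vscale x 4 x float> %acc,
                                               <vscale x 8 x half> %b,
                                               <vscale x 8 x half> %c) {
; Multiply the even-indexed (bottom) half elements of %b and %c, widen the
; products to single precision and accumulate into %acc (FMLALB).
  %mla = call <vscale x 4 x float> @llvm.aarch64.sve.fmlalb.nxv4f32(<vscale x 4 x float> %acc,
                                                                    <vscale x 8 x half> %b,
                                                                    <vscale x 8 x half> %c)
; Predicated pairwise add of the accumulated vector with itself (FADDP).
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.faddp.nxv4f32(<vscale x 4 x i1> %pg,
                                                                   <vscale x 4 x float> %mla,
                                                                   <vscale x 4 x float> %mla)
  ret <vscale x 4 x float> %out
}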