diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -335,6 +335,8 @@
   bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
   bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
                                SDValue &Offset);
+
+  bool SelectAllActivePredicate(SDValue N);
 };
 } // end anonymous namespace
@@ -4983,3 +4985,9 @@
   return false;
 }
+
+bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
+  const AArch64TargetLowering *TLI = static_cast<const AArch64TargetLowering *>(getTargetLowering());
+
+  return TLI->isAllActivePredicate(N);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -820,6 +820,8 @@
     return 128;
   }
+  bool isAllActivePredicate(SDValue N) const;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13849,6 +13849,29 @@
                      Zero);
 }
+static bool isAllActivePredicate(SDValue N) {
+  unsigned NumElts = N.getValueType().getVectorMinNumElements();
+
+  // Look through cast.
+  SDValue &Op = N;
+  while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
+    Op = Op.getOperand(0);
+    // When reinterpreting from a type with fewer elements the "new" elements
+    // are not active, so bail if they're likely to be used.
+    if (Op.getValueType().getVectorMinNumElements() < NumElts)
+      return false;
+  }
+
+  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
+  // or smaller than the implicit element type represented by N.
+  // NOTE: A larger element count implies a smaller element type.
+  if (Op.getOpcode() == AArch64ISD::PTRUE &&
+      Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
+    return Op.getValueType().getVectorMinNumElements() >= NumElts;
+
+  return false;
+}
+
 // If a merged operation has no inactive lanes we can relax it to a predicated
 // or unpredicated operation, which potentially allows better isel (perhaps
 // using immediate forms) or relaxing register reuse requirements.
@@ -13859,8 +13882,7 @@
   SDValue Pg = N->getOperand(1);
   // ISD way to specify an all active predicate.
-  if ((Pg.getOpcode() == AArch64ISD::PTRUE) &&
-      (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all))
+  if (isAllActivePredicate(Pg))
     return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg,
                        N->getOperand(2), N->getOperand(3));
@@ -13953,6 +13975,12 @@
                        N->getOperand(1));
   case Intrinsic::aarch64_sve_ext:
     return LowerSVEIntrinsicEXT(N, DAG);
+  case Intrinsic::aarch64_sve_mul:
+    return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
+  case Intrinsic::aarch64_sve_smulh:
+    return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
+  case Intrinsic::aarch64_sve_umulh:
+    return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
   case Intrinsic::aarch64_sve_smin:
     return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
   case Intrinsic::aarch64_sve_umin:
@@ -13967,6 +13995,12 @@
     return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
   case Intrinsic::aarch64_sve_asr:
     return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
+  case Intrinsic::aarch64_sve_fadd:
+    return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
+  case Intrinsic::aarch64_sve_fsub:
+    return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
+  case Intrinsic::aarch64_sve_fmul:
+    return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
   case Intrinsic::aarch64_sve_cmphs:
     if (!N->getOperand(2).getValueType().isFloatingPoint())
       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
@@ -17654,3 +17688,7 @@
   return Op;
 }
+
+bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const {
+  return ::isAllActivePredicate(N);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1380,9 +1380,9 @@
   defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_p>;
   defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_p>;
-  defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
-  defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
-  defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
+  defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr", int_aarch64_sve_asr_wide>;
+  defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr", int_aarch64_sve_lsr_wide>;
+  defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl", int_aarch64_sve_lsl_wide>;

   // Predicated shifts
   defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr", "ASR_ZPZI", int_aarch64_sve_asr>;
@@ -2412,24 +2412,6 @@
   defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag, AArch64umulh_p>;
   defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;
-  // Add patterns for unpredicated version of smulh and umulh.
- def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)), - (SMULH_ZZZ_B $Op1, $Op2)>; - def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)), - (SMULH_ZZZ_H $Op1, $Op2)>; - def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)), - (SMULH_ZZZ_S $Op1, $Op2)>; - def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)), - (SMULH_ZZZ_D $Op1, $Op2)>; - def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)), - (UMULH_ZZZ_B $Op1, $Op2)>; - def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)), - (UMULH_ZZZ_H $Op1, $Op2)>; - def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)), - (UMULH_ZZZ_S $Op1, $Op2)>; - def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)), - (UMULH_ZZZ_D $Op1, $Op2)>; - // SVE2 complex integer dot product (indexed) defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -221,6 +221,8 @@ def SVEShiftImmR32 : ComplexPattern", []>; def SVEShiftImmR64 : ComplexPattern", []>; +def SVEAllActive : ComplexPattern; + class SVEExactFPImm : AsmOperandClass { let Name = "SVEExactFPImmOperand" # Suffix; let DiagnosticType = "Invalid" # Name; @@ -339,9 +341,9 @@ : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))), (inst $Op1, i32:$imm, i32:$shift)>; -class SVE_1_Op_Imm_Arith_Pred_Pat - : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))), + : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))), (inst $Op1, i32:$imm)>; class SVE_1_Op_Imm_Log_Pat -: Pat<(vtd (op (pt (AArch64ptrue 31)), vt1:$Op1, vt2:$Op2)), +: Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2)), (inst $Op1, $Op2)>; class SVE_3_Op_Pat -: Pat<(vt (op (pt (AArch64ptrue 31)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), +: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), (inst $Rn, i32:$imm)>; // @@ -4052,10 +4054,10 @@ def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _D)>; } multiclass sve_int_arith_imm1_unsigned opc, string asm, SDPatternOperator op> { @@ -4064,10 +4066,10 @@ def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>; def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _H)>; + def : 
SVE_1_Op_Imm_Arith_All_Active(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _D)>; } multiclass sve_int_arith_imm2 { @@ -4076,10 +4078,10 @@ def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -5183,10 +5185,14 @@ let Inst{4-0} = Zd; } -multiclass sve_int_bin_cons_shift_wide opc, string asm> { +multiclass sve_int_bin_cons_shift_wide opc, string asm, SDPatternOperator op> { def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>; def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>; def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>; + + def : SVE_2_Op_Pred_All_Active(NAME # _B)>; + def : SVE_2_Op_Pred_All_Active(NAME # _H)>; + def : SVE_2_Op_Pred_All_Active(NAME # _S)>; } class sve_int_bin_cons_shift_imm tsz8_64, bits<2> opc, string asm, diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll @@ -110,6 +110,52 @@ ret %out } +; As smax_i32 but where pg is i8 based and thus compatible for i32. +define @smax_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: smax_i32_ptrue_all_b: +; CHECK: smax z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As smax_i32 but where pg is i16 based and thus compatible for i32. +define @smax_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: smax_i32_ptrue_all_h: +; CHECK: smax z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As smax_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @smax_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: smax_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: smax z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} ; SMIN @@ -220,6 +266,53 @@ ret %out } +; As smin_i32 but where pg is i8 based and thus compatible for i32. 
+define @smin_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: smin_i32_ptrue_all_b: +; CHECK: smin z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As smin_i32 but where pg is i16 based and thus compatible for i32. +define @smin_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: smin_i32_ptrue_all_h: +; CHECK: smin z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As smin_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @smin_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: smin_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: smin z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + ; UMAX define @umax_i8( %a) { @@ -329,6 +422,53 @@ ret %out } +; As umax_i32 but where pg is i8 based and thus compatible for i32. +define @umax_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: umax_i32_ptrue_all_b: +; CHECK: umax z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umax_i32 but where pg is i16 based and thus compatible for i32. +define @umax_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: umax_i32_ptrue_all_h: +; CHECK: umax z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umax_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. 
+define @umax_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: umax_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: umax z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + ; UMIN define @umin_i8( %a) { @@ -438,6 +578,53 @@ ret %out } +; As umin_i32 but where pg is i8 based and thus compatible for i32. +define @umin_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: umin_i32_ptrue_all_b: +; CHECK: umin z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umin_i32 but where pg is i16 based and thus compatible for i32. +define @umin_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: umin_i32_ptrue_all_h: +; CHECK: umin z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umin_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. 
+define @umin_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: umin_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: umin z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + ; SQADD define @sqadd_b_lowimm( %a) { @@ -660,6 +847,42 @@ ret %out } +define @uqadd_s_highimm( %a) { +; CHECK-LABEL: uqadd_s_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.s, z0.s, #8192 // =0x2000 +; CHECK-NEXT: ret + %elt = insertelement undef, i32 8192, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.x.nxv4i32( %a, + %splat) + ret %out +} + +define @uqadd_d_lowimm( %a) { +; CHECK-LABEL: uqadd_d_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.d, z0.d, #255 // =0xff +; CHECK-NEXT: ret + %elt = insertelement undef, i64 255, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.x.nxv2i64( %a, + %splat) + ret %out +} + +define @uqadd_d_highimm( %a) { +; CHECK-LABEL: uqadd_d_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.d, z0.d, #65280 // =0xff00 +; CHECK-NEXT: ret + %elt = insertelement undef, i64 65280, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.x.nxv2i64( %a, + %splat) + ret %out +} + ; UQSUB define @uqsub_b_lowimm( %a) { @@ -746,43 +969,6 @@ ret %out } - -define @uqadd_s_highimm( %a) { -; CHECK-LABEL: uqadd_s_highimm: -; CHECK: // %bb.0: -; CHECK-NEXT: uqadd z0.s, z0.s, #8192 // =0x2000 -; CHECK-NEXT: ret - %elt = insertelement undef, i32 8192, i32 0 - %splat = shufflevector %elt, undef, zeroinitializer - %out = call @llvm.aarch64.sve.uqadd.x.nxv4i32( %a, - %splat) - ret %out -} - -define @uqadd_d_lowimm( %a) { -; CHECK-LABEL: uqadd_d_lowimm: -; CHECK: // %bb.0: -; CHECK-NEXT: uqadd z0.d, z0.d, #255 // =0xff -; CHECK-NEXT: ret - %elt = insertelement undef, i64 255, i32 0 - %splat = shufflevector %elt, undef, zeroinitializer - %out = call @llvm.aarch64.sve.uqadd.x.nxv2i64( %a, - %splat) - ret %out -} - -define @uqadd_d_highimm( %a) { -; CHECK-LABEL: uqadd_d_highimm: -; CHECK: // %bb.0: -; CHECK-NEXT: uqadd z0.d, z0.d, #65280 // =0xff00 -; CHECK-NEXT: ret - %elt = insertelement undef, i64 65280, i32 0 - %splat = shufflevector %elt, undef, zeroinitializer - %out = call @llvm.aarch64.sve.uqadd.x.nxv2i64( %a, - %splat) - ret %out -} - ; ASR define @asr_i8( %pg, %a) { @@ -1321,6 +1507,103 @@ ret %out } +; As lsr_i32 but where pg is i8 based and thus compatible for i32. +define @lsr_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_b: +; CHECK: lsr z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.lsr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As lsr_i32 but where pg is i16 based and thus compatible for i32. 
+define @lsr_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_h: +; CHECK: lsr z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.lsr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As lsr_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @lsr_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: lsr z0.s, [[PG]]/m, z0.s, #1 +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.lsr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; +; MUL +; + +; As mul_i32 but where pg is i8 based and thus compatible for i32. +define @mul_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: mul_i32_ptrue_all_b: +; CHECK: mul z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.mul.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As mul_i32 but where pg is i16 based and thus compatible for i32. +define @mul_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: mul_i32_ptrue_all_h: +; CHECK: mul z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.mul.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As mul_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. 
+define @mul_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: mul_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: mul z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.mul.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + declare @llvm.aarch64.sve.sqadd.x.nxv16i8(, ) declare @llvm.aarch64.sve.sqadd.x.nxv8i16(, ) declare @llvm.aarch64.sve.sqadd.x.nxv4i32(, ) @@ -1376,6 +1659,21 @@ declare @llvm.aarch64.sve.lsr.nxv4i32(, , ) declare @llvm.aarch64.sve.lsr.nxv2i64(, , ) +declare @llvm.aarch64.sve.mul.nxv16i8(, , ) +declare @llvm.aarch64.sve.mul.nxv8i16(, , ) +declare @llvm.aarch64.sve.mul.nxv4i32(, , ) +declare @llvm.aarch64.sve.mul.nxv2i64(, , ) + +declare @llvm.aarch64.sve.convert.from.svbool.nxv4i1() +declare @llvm.aarch64.sve.convert.from.svbool.nxv8i1() +declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() + +declare @llvm.aarch64.sve.convert.to.svbool.nxv4i1() +declare @llvm.aarch64.sve.convert.to.svbool.nxv8i1() +declare @llvm.aarch64.sve.convert.to.svbool.nxv2i1() + +declare @llvm.aarch64.sve.dup.x.nxv4i32(i32) + declare @llvm.aarch64.sve.ptrue.nxv16i1(i32 %pattern) declare @llvm.aarch64.sve.ptrue.nxv8i1(i32 %pattern) declare @llvm.aarch64.sve.ptrue.nxv4i1(i32 %pattern) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll @@ -0,0 +1,509 @@ +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; MUL +; + +define @mul_i8( %a, %b) #0 { +; CHECK-LABEL: mul_i8: +; CHECK: mul z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.mul.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @mul_i16( %a, %b) #0 { +; CHECK-LABEL: mul_i16: +; CHECK: mul z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.mul.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @mul_i32( %a, %b) #0 { +; CHECK-LABEL: mul_i32: +; CHECK: mul z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.mul.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @mul_i64( %a, %b) #0 { +; CHECK-LABEL: mul_i64: +; CHECK: mul z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.mul.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; SMULH +; + +define @smulh_i8( %a, %b) #0 { +; CHECK-LABEL: smulh_i8: +; CHECK: smulh z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.smulh.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @smulh_i16( %a, %b) #0 { +; CHECK-LABEL: smulh_i16: +; CHECK: smulh z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.smulh.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @smulh_i32( %a, %b) #0 { +; CHECK-LABEL: smulh_i32: +; CHECK: smulh z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.smulh.nxv4i32( %pg, + %a, + %b) + 
ret %out +} + +define @smulh_i64( %a, %b) #0 { +; CHECK-LABEL: smulh_i64: +; CHECK: smulh z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.smulh.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; UMULH +; + +define @umulh_i8( %a, %b) #0 { +; CHECK-LABEL: umulh_i8: +; CHECK: umulh z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.umulh.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @umulh_i16( %a, %b) #0 { +; CHECK-LABEL: umulh_i16: +; CHECK: umulh z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.umulh.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @umulh_i32( %a, %b) #0 { +; CHECK-LABEL: umulh_i32: +; CHECK: umulh z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.umulh.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @umulh_i64( %a, %b) #0 { +; CHECK-LABEL: umulh_i64: +; CHECK: umulh z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.umulh.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; As umulh_i32 but where pg is i8 based and thus compatible for i32. +define @umulh_i32_ptrue_all_b( %a, %b) #0 { +; CHECK-LABEL: umulh_i32_ptrue_all_b: +; CHECK: umulh z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.umulh.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umulh_i32 but where pg is i16 based and thus compatible for i32. +define @umulh_i32_ptrue_all_h( %a, %b) #0 { +; CHECK-LABEL: umulh_i32_ptrue_all_h: +; CHECK: umulh z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.umulh.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umulh_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. 
+define @umulh_i32_ptrue_all_d( %a, %b) #0 { +; CHECK-LABEL: umulh_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: umulh z0.s, [[PG]]/m, z0.s, z1.s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.umulh.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; +; ASR (wide) +; + +define @asr_i8( %a, %b) #0 { +; CHECK-LABEL: asr_i8: +; CHECK: asr z0.b, z0.b, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.asr.wide.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @asr_i16( %a, %b) #0 { +; CHECK-LABEL: asr_i16: +; CHECK: asr z0.h, z0.h, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.asr.wide.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @asr_i32( %a, %b) #0 { +; CHECK-LABEL: asr_i32: +; CHECK: asr z0.s, z0.s, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.asr.wide.nxv4i32( %pg, + %a, + %b) + ret %out +} + +; +; LSL (wide) +; + +define @lsl_i8( %a, %b) #0 { +; CHECK-LABEL: lsl_i8: +; CHECK: lsl z0.b, z0.b, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.lsl.wide.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @lsl_i16( %a, %b) #0 { +; CHECK-LABEL: lsl_i16: +; CHECK: lsl z0.h, z0.h, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.lsl.wide.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @lsl_i32( %a, %b) #0 { +; CHECK-LABEL: lsl_i32: +; CHECK: lsl z0.s, z0.s, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.lsl.wide.nxv4i32( %pg, + %a, + %b) + ret %out +} + +; +; LSR (wide) +; + +define @lsr_i8( %a, %b) #0 { +; CHECK-LABEL: lsr_i8: +; CHECK: lsr z0.b, z0.b, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.lsr.wide.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @lsr_i16( %a, %b) #0 { +; CHECK-LABEL: lsr_i16: +; CHECK: lsr z0.h, z0.h, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.lsr.wide.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @lsr_i32( %a, %b) #0 { +; CHECK-LABEL: lsr_i32: +; CHECK: lsr z0.s, z0.s, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.lsr.wide.nxv4i32( %pg, + %a, + %b) + ret %out +} + +; As lsr_i32 but where pg is i8 based and thus compatible for i32. +define @lsr_i32_ptrue_all_b( %a, %b) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_b: +; CHECK: lsr z0.s, z0.s, z1.d +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.lsr.wide.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As lsr_i32 but where pg is i16 based and thus compatible for i32. 
+define @lsr_i32_ptrue_all_h( %a, %b) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_h: +; CHECK: lsr z0.s, z0.s, z1.d +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.lsr.wide.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As lsr_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @lsr_i32_ptrue_all_d( %a, %b) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: lsr z0.s, [[PG]]/m, z0.s, z1.d +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.lsr.wide.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; +; FADD +; + +define @fadd_half( %a, %b) #0 { +; CHECK-LABEL: fadd_half: +; CHECK: fadd z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.fadd.nxv8f16( %pg, + %a, + %b) + ret %out +} + +define @fadd_float( %a, %b) #0 { +; CHECK-LABEL: fadd_float: +; CHECK: fadd z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.fadd.nxv4f32( %pg, + %a, + %b) + ret %out +} + +define @fadd_double( %a, %b) #0 { +; CHECK-LABEL: fadd_double: +; CHECK: fadd z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.fadd.nxv2f64( %pg, + %a, + %b) + ret %out +} + +; +; FSUB +; + +define @fsub_half( %a, %b) #0 { +; CHECK-LABEL: fsub_half: +; CHECK: fsub z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.fsub.nxv8f16( %pg, + %a, + %b) + ret %out +} + +define @fsub_float( %a, %b) #0 { +; CHECK-LABEL: fsub_float: +; CHECK: fsub z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.fsub.nxv4f32( %pg, + %a, + %b) + ret %out +} + +define @fsub_double( %a, %b) #0 { +; CHECK-LABEL: fsub_double: +; CHECK: fsub z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.fsub.nxv2f64( %pg, + %a, + %b) + ret %out +} + +; +; FMUL +; + +define @fmul_half( %a, %b) #0 { +; CHECK-LABEL: fmul_half: +; CHECK: fmul z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.fmul.nxv8f16( %pg, + %a, + %b) + ret %out +} + +define @fmul_float( %a, %b) #0 { +; CHECK-LABEL: fmul_float: +; CHECK: fmul z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.fmul.nxv4f32( %pg, + %a, + %b) + ret %out +} + +define @fmul_double( %a, %b) #0 { +; CHECK-LABEL: fmul_double: +; CHECK: fmul z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.fmul.nxv2f64( %pg, + %a, + %b) + ret %out +} + +declare @llvm.aarch64.sve.mul.nxv16i8(, , ) +declare @llvm.aarch64.sve.mul.nxv8i16(, , ) +declare @llvm.aarch64.sve.mul.nxv4i32(, , ) +declare @llvm.aarch64.sve.mul.nxv2i64(, , ) + +declare @llvm.aarch64.sve.smulh.nxv16i8(, , ) +declare 
<vscale x 8 x i16> @llvm.aarch64.sve.smulh.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.smulh.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.smulh.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.umulh.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.umulh.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.umulh.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.asr.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.asr.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.asr.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.lsl.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.lsl.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.lsl.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.lsr.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.lsr.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+attributes #0 = { "target-features"="+sve2" }
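
A minimal standalone reproducer, illustrative only and not part of the patch (the function name is made up), showing the two pieces working together: the predicate is built from an nxv16i1 ptrue and narrowed via convert.from.svbool, so isAllActivePredicate has to look through the REINTERPRET_CAST before the fmul intrinsic can be relaxed to FMUL_PRED and then matched by the unpredicated patterns. Assuming the combine behaves as in the tests above, running llc on this module is expected to emit the unpredicated form, fmul z0.s, z0.s, z1.s, with no ptrue left in the output.

target triple = "aarch64-unknown-linux-gnu"

; Predicate is an all-active nxv16i1 ptrue reinterpreted as nxv4i1.
define <vscale x 4 x float> @fmul_f32_ptrue_all_b(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
  %pg.b = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %pg.s = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
  ; Expected to select the unpredicated FMUL_ZZZ after this patch.
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg.s,
                                                                  <vscale x 4 x float> %a,
                                                                  <vscale x 4 x float> %b)
  ret <vscale x 4 x float> %out
}

declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)

attributes #0 = { "target-features"="+sve" }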