diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -335,6 +335,8 @@
   bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
   bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
                                SDValue &Offset);
+
+  bool SelectAllActivePredicate(SDValue N);
 };

 } // end anonymous namespace
@@ -4983,3 +4985,10 @@

   return false;
 }
+
+bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
+  const AArch64TargetLowering *TLI =
+      static_cast<const AArch64TargetLowering *>(getTargetLowering());
+
+  return TLI->isAllActivePredicate(N);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -820,6 +820,8 @@
     return 128;
   }

+  bool isAllActivePredicate(SDValue N) const;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13849,6 +13849,29 @@
                      Zero);
 }

+static bool isAllActivePredicate(SDValue N) {
+  unsigned NumElts = N.getValueType().getVectorMinNumElements();
+
+  // Look through cast.
+  SDValue &Op = N;
+  while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
+    Op = Op.getOperand(0);
+    // When reinterpreting from a type with fewer elements the "new" elements
+    // are not active, so bail if they're likely to be used.
+    if (Op.getValueType().getVectorMinNumElements() < NumElts)
+      return false;
+  }
+
+  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
+  // or smaller than the implicit element type represented by N.
+  // NOTE: A larger element count implies a smaller element type.
+  if (Op.getOpcode() == AArch64ISD::PTRUE &&
+      Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
+    return Op.getValueType().getVectorMinNumElements() >= NumElts;
+
+  return false;
+}
+
 // If a merged operation has no inactive lanes we can relax it to a predicated
 // or unpredicated operation, which potentially allows better isel (perhaps
 // using immediate forms) or relaxing register reuse requirements.
@@ -13859,8 +13882,7 @@
   SDValue Pg = N->getOperand(1);

   // ISD way to specify an all active predicate.
-  if ((Pg.getOpcode() == AArch64ISD::PTRUE) &&
-      (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all))
+  if (isAllActivePredicate(Pg))
     return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg,
                        N->getOperand(2), N->getOperand(3));

@@ -13953,6 +13975,12 @@
                        N->getOperand(1));
   case Intrinsic::aarch64_sve_ext:
     return LowerSVEIntrinsicEXT(N, DAG);
+  case Intrinsic::aarch64_sve_mul:
+    return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
+  case Intrinsic::aarch64_sve_smulh:
+    return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
+  case Intrinsic::aarch64_sve_umulh:
+    return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
   case Intrinsic::aarch64_sve_smin:
     return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
   case Intrinsic::aarch64_sve_umin:
@@ -13967,6 +13995,12 @@
     return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
   case Intrinsic::aarch64_sve_asr:
     return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
+  case Intrinsic::aarch64_sve_fadd:
+    return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
+  case Intrinsic::aarch64_sve_fsub:
+    return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
+  case Intrinsic::aarch64_sve_fmul:
+    return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
   case Intrinsic::aarch64_sve_cmphs:
     if (!N->getOperand(2).getValueType().isFloatingPoint())
       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
@@ -17654,3 +17688,7 @@

   return Op;
 }
+
+bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const {
+  return ::isAllActivePredicate(N);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1380,9 +1380,9 @@
   defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_p>;
   defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_p>;

-  defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
-  defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
-  defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
+  defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr", int_aarch64_sve_asr_wide>;
+  defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr", int_aarch64_sve_lsr_wide>;
+  defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl", int_aarch64_sve_lsl_wide>;

   // Predicated shifts
   defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr", "ASR_ZPZI", int_aarch64_sve_asr>;
@@ -2412,24 +2412,6 @@
   defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag, AArch64umulh_p>;
   defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;

-  // Add patterns for unpredicated version of smulh and umulh.
-  def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
-            (SMULH_ZZZ_B $Op1, $Op2)>;
-  def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
-            (SMULH_ZZZ_H $Op1, $Op2)>;
-  def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
-            (SMULH_ZZZ_S $Op1, $Op2)>;
-  def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
-            (SMULH_ZZZ_D $Op1, $Op2)>;
-  def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
-            (UMULH_ZZZ_B $Op1, $Op2)>;
-  def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
-            (UMULH_ZZZ_H $Op1, $Op2)>;
-  def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
-            (UMULH_ZZZ_S $Op1, $Op2)>;
-  def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
-            (UMULH_ZZZ_D $Op1, $Op2)>;
-
   // SVE2 complex integer dot product (indexed)
   defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -221,6 +221,8 @@
 def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>;
 def SVEShiftImmR64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 64, true>", []>;

+def SVEAllActive : ComplexPattern<untyped, 0, "SelectAllActivePredicate", []>;
+
 class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
   let Name = "SVEExactFPImmOperand" # Suffix;
   let DiagnosticType = "Invalid" # Name;
@@ -339,9 +341,9 @@
   : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))),
         (inst $Op1, i32:$imm, i32:$shift)>;

-class SVE_1_Op_Imm_Arith_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
+class SVE_1_Op_Imm_Arith_All_Active<ValueType vt, ValueType pt, SDPatternOperator op,
                                   ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
-  : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
+  : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
         (inst $Op1, i32:$imm)>;

 class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
@@ -357,7 +359,7 @@

 class SVE_2_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op, ValueType pt,
                                ValueType vt1, ValueType vt2, Instruction inst>
-: Pat<(vtd (op (pt (AArch64ptrue 31)), vt1:$Op1, vt2:$Op2)),
+: Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2)),
       (inst $Op1, $Op2)>;

 class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
@@ -432,7 +434,7 @@

 class SVE_Shift_DupImm_All_Active_Pat<ValueType vt, SDPatternOperator op,
                                       ValueType pt, ValueType it, ComplexPattern cast, Instruction inst>
-: Pat<(vt (op (pt (AArch64ptrue 31)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
+: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
       (inst $Rn, i32:$imm)>;

 //
@@ -4052,10 +4054,10 @@
   def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>;
   def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>;

-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
 }

 multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> {
@@ -4064,10 +4066,10 @@
   def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>;
   def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>;

-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImm8Pat, !cast<Instruction>(NAME # _B)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImm8Pat, !cast<Instruction>(NAME # _B)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>;
 }

 multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
@@ -4076,10 +4078,10 @@
   def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
   def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;

-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
 }

 //===----------------------------------------------------------------------===//
@@ -5183,10 +5185,14 @@
   let Inst{4-0} = Zd;
 }

-multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
+multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm, SDPatternOperator op> {
   def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>;
   def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>;
   def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>;
+
+  def : SVE_2_Op_Pred_All_Active<nxv16i8, op, nxv16i1, nxv16i8, nxv2i64, !cast<Instruction>(NAME # _B)>;
+  def : SVE_2_Op_Pred_All_Active<nxv8i16, op, nxv8i1, nxv8i16, nxv2i64, !cast<Instruction>(NAME # _H)>;
+  def : SVE_2_Op_Pred_All_Active<nxv4i32, op, nxv4i1, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
 }

 class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
@@ -110,6 +110,52 @@
   ret <vscale x 2 x i64> %out
 }

+; As smax_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_b:
+; CHECK: smax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smax_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_h:
+; CHECK: smax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smax_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: smax z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}

 ; SMIN

@@ -220,6 +266,53 @@
   ret <vscale x 2 x i64> %out
 }

+; As smin_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_b:
+; CHECK: smin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smin_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_h:
+; CHECK: smin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smin_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: smin z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
 ; UMAX

 define <vscale x 16 x i8> @umax_i8(<vscale x 16 x i8> %a) {
@@ -329,6 +422,53 @@
   ret <vscale x 2 x i64> %out
 }

+; As umax_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_b:
+; CHECK: umax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umax_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_h:
+; CHECK: umax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umax_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: umax z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
 ; UMIN

 define <vscale x 16 x i8> @umin_i8(<vscale x 16 x i8> %a) {
@@ -438,6 +578,53 @@
   ret <vscale x 2 x i64> %out
 }

+; As umin_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_b:
+; CHECK: umin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umin_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_h:
+; CHECK: umin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umin_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: umin z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
 ; SQADD

 define <vscale x 16 x i8> @sqadd_b_lowimm(<vscale x 16 x i8> %a) {
@@ -660,6 +847,42 @@
   ret <vscale x 4 x i32> %out
 }

+define <vscale x 4 x i32> @uqadd_s_highimm(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: uqadd_s_highimm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uqadd z0.s, z0.s, #8192 // =0x2000
+; CHECK-NEXT: ret
+  %elt = insertelement <vscale x 4 x i32> undef, i32 8192, i32 0
+  %splat = shufflevector <vscale x 4 x i32> %elt, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqadd.x.nxv4i32(<vscale x 4 x i32> %a,
+                                                                   <vscale x 4 x i32> %splat)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uqadd_d_lowimm(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uqadd_d_lowimm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uqadd z0.d, z0.d, #255 // =0xff
+; CHECK-NEXT: ret
+  %elt = insertelement <vscale x 2 x i64> undef, i64 255, i32 0
+  %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
+                                                                   <vscale x 2 x i64> %splat)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @uqadd_d_highimm(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uqadd_d_highimm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uqadd z0.d, z0.d, #65280 // =0xff00
+; CHECK-NEXT: ret
+  %elt = insertelement <vscale x 2 x i64> undef, i64 65280, i32 0
+  %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
+                                                                   <vscale x 2 x i64> %splat)
+  ret <vscale x 2 x i64> %out
+}
+
 ; UQSUB

 define <vscale x 16 x i8> @uqsub_b_lowimm(<vscale x 16 x i8> %a) {
@@ -746,43 +969,6 @@
   ret <vscale x 2 x i64> %out
 }

-
-define <vscale x 4 x i32> @uqadd_s_highimm(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: uqadd_s_highimm:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd z0.s, z0.s, #8192 // =0x2000
-; CHECK-NEXT: ret
-  %elt = insertelement <vscale x 4 x i32> undef, i32 8192, i32 0
-  %splat = shufflevector <vscale x 4 x i32> %elt, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
-  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqadd.x.nxv4i32(<vscale x 4 x i32> %a,
-                                                                   <vscale x 4 x i32> %splat)
-  ret <vscale x 4 x i32> %out
-}
-
-define <vscale x 2 x i64> @uqadd_d_lowimm(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: uqadd_d_lowimm:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd z0.d, z0.d, #255 // =0xff
-; CHECK-NEXT: ret
-  %elt = insertelement <vscale x 2 x i64> undef, i64 255, i32 0
-  %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
-  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
-                                                                   <vscale x 2 x i64> %splat)
-  ret <vscale x 2 x i64> %out
-}
-
-define <vscale x 2 x i64> @uqadd_d_highimm(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: uqadd_d_highimm:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd z0.d, z0.d, #65280 // =0xff00
-; CHECK-NEXT: ret
-  %elt = insertelement <vscale x 2 x i64> undef, i64 65280, i32 0
-  %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
-  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
-                                                                   <vscale x 2 x i64> %splat)
-  ret <vscale x 2 x i64> %out
-}
-
 ; ASR

 define <vscale x 16 x i8> @asr_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
@@ -1321,6 +1507,103 @@
   ret <vscale x 2 x i64> %out
 }

+; As lsr_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_b:
+; CHECK: lsr z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_h:
+; CHECK: lsr z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: lsr z0.s, [[PG]]/m, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; MUL
+;
+
+; As mul_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_b:
+; CHECK: mul z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As mul_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_h:
+; CHECK: mul z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As mul_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: mul z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.sqadd.x.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.sqadd.x.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.sqadd.x.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -1376,6 +1659,21 @@
 declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)

+declare <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+
 declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 %pattern)
 declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 %pattern)
 declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 %pattern)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll
@@ -0,0 +1,509 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; MUL
+;
+
+define <vscale x 16 x i8> @mul_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: mul_i8:
+; CHECK: mul z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %pg,
+                                                               <vscale x 16 x i8> %a,
+                                                               <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @mul_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: mul_i16:
+; CHECK: mul z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg,
+                                                               <vscale x 8 x i16> %a,
+                                                               <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @mul_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: mul_i32:
+; CHECK: mul z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg,
+                                                               <vscale x 4 x i32> %a,
+                                                               <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @mul_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: mul_i64:
+; CHECK: mul z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg,
+                                                               <vscale x 2 x i64> %a,
+                                                               <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+;
+; SMULH
+;
+
+define <vscale x 16 x i8> @smulh_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: smulh_i8:
+; CHECK: smulh z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.smulh.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                 <vscale x 16 x i8> %a,
+                                                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @smulh_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: smulh_i16:
+; CHECK: smulh z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.smulh.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x i16> %a,
+                                                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @smulh_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: smulh_i32:
+; CHECK: smulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smulh.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                 <vscale x 4 x i32> %a,
+                                                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @smulh_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: smulh_i64:
+; CHECK: smulh z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.smulh.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %a,
+                                                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+;
+; UMULH
+;
+
+define <vscale x 16 x i8> @umulh_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: umulh_i8:
+; CHECK: umulh z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.umulh.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                 <vscale x 16 x i8> %a,
+                                                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @umulh_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: umulh_i16:
+; CHECK: umulh z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.umulh.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x i16> %a,
+                                                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @umulh_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32:
+; CHECK: umulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                 <vscale x 4 x i32> %a,
+                                                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @umulh_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: umulh_i64:
+; CHECK: umulh z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.umulh.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %a,
+                                                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+; As umulh_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @umulh_i32_ptrue_all_b(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32_ptrue_all_b:
+; CHECK: umulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                      <vscale x 4 x i32> %a,
+                                                                      <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umulh_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @umulh_i32_ptrue_all_h(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32_ptrue_all_h:
+; CHECK: umulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                      <vscale x 4 x i32> %a,
+                                                                      <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umulh_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the unpredicated form cannot be used.
+define <vscale x 4 x i32> @umulh_i32_ptrue_all_d(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: umulh z0.s, [[PG]]/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                      <vscale x 4 x i32> %a,
+                                                                      <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; ASR (wide)
+;
+
+define <vscale x 16 x i8> @asr_i8(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: asr_i8:
+; CHECK: asr z0.b, z0.b, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.asr.wide.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                    <vscale x 16 x i8> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @asr_i16(<vscale x 8 x i16> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: asr_i16:
+; CHECK: asr z0.h, z0.h, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.asr.wide.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                    <vscale x 8 x i16> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @asr_i32(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: asr_i32:
+; CHECK: asr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.asr.wide.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; LSL (wide)
+;
+
+define <vscale x 16 x i8> @lsl_i8(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsl_i8:
+; CHECK: lsl z0.b, z0.b, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.lsl.wide.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                    <vscale x 16 x i8> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @lsl_i16(<vscale x 8 x i16> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsl_i16:
+; CHECK: lsl z0.h, z0.h, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.wide.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                    <vscale x 8 x i16> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @lsl_i32(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsl_i32:
+; CHECK: lsl z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.wide.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; LSR (wide)
+;
+
+define <vscale x 16 x i8> @lsr_i8(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i8:
+; CHECK: lsr z0.b, z0.b, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.lsr.wide.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                    <vscale x 16 x i8> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @lsr_i16(<vscale x 8 x i16> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i16:
+; CHECK: lsr z0.h, z0.h, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.wide.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                    <vscale x 8 x i16> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @lsr_i32(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32:
+; CHECK: lsr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_b(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_b:
+; CHECK: lsr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                         <vscale x 4 x i32> %a,
+                                                                         <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_h(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_h:
+; CHECK: lsr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                         <vscale x 4 x i32> %a,
+                                                                         <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the unpredicated form cannot be used.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_d(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: lsr z0.s, [[PG]]/m, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                         <vscale x 4 x i32> %a,
+                                                                         <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; FADD
+;
+
+define <vscale x 8 x half> @fadd_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fadd_half:
+; CHECK: fadd z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fadd_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: fadd_float:
+; CHECK: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fadd_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: fadd_double:
+; CHECK: fadd z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FSUB
+;
+
+define <vscale x 8 x half> @fsub_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fsub_half:
+; CHECK: fsub z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fsub_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: fsub_float:
+; CHECK: fsub z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fsub_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: fsub_double:
+; CHECK: fsub z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMUL
+;
+
+define <vscale x 8 x half> @fmul_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fmul_half:
+; CHECK: fmul z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fmul_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: fmul_float:
+; CHECK: fmul z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fmul_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: fmul_double:
+; CHECK: fmul z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.smulh.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.smulh.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.smulh.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.smulh.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.umulh.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.umulh.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.umulh.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.asr.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.asr.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.asr.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.lsl.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.lsl.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.lsl.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.lsr.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.lsr.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+attributes #0 = { "target-features"="+sve2" }