diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13875,16 +13875,22 @@ // If a merged operation has no inactive lanes we can relax it to a predicated // or unpredicated operation, which potentially allows better isel (perhaps // using immediate forms) or relaxing register reuse requirements. -static SDValue convertMergedOpToPredOp(SDNode *N, unsigned PredOpc, - SelectionDAG &DAG) { +static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, + SelectionDAG &DAG, + bool UnpredOp = false) { assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!"); assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!"); SDValue Pg = N->getOperand(1); // ISD way to specify an all active predicate. - if (isAllActivePredicate(Pg)) - return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg, - N->getOperand(2), N->getOperand(3)); + if (isAllActivePredicate(Pg)) { + if (UnpredOp) + return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(2), + N->getOperand(3)); + else + return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, + N->getOperand(2), N->getOperand(3)); + } // FUTURE: SplatVector(true) return SDValue(); @@ -14001,6 +14007,36 @@ return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG); case Intrinsic::aarch64_sve_fmul: return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG); + case Intrinsic::aarch64_sve_add: + return convertMergedOpToPredOp(N, ISD::ADD, DAG, true); + case Intrinsic::aarch64_sve_sub: + return convertMergedOpToPredOp(N, ISD::SUB, DAG, true); + case Intrinsic::aarch64_sve_and: + return convertMergedOpToPredOp(N, ISD::AND, DAG, true); + case Intrinsic::aarch64_sve_eor: + return convertMergedOpToPredOp(N, ISD::XOR, DAG, true); + case Intrinsic::aarch64_sve_orr: + return convertMergedOpToPredOp(N, ISD::OR, DAG, true); + case Intrinsic::aarch64_sve_sqadd: + return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true); + case Intrinsic::aarch64_sve_sqsub: + return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true); + case Intrinsic::aarch64_sve_uqadd: + return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true); + case Intrinsic::aarch64_sve_uqsub: + return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true); + case Intrinsic::aarch64_sve_sqadd_x: + return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_sve_sqsub_x: + return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_sve_uqadd_x: + return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_sve_uqsub_x: + return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_sve_cmphs: if (!N->getOperand(2).getValueType().isFloatingPoint()) return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -281,12 +281,12 @@ def SETFFR : sve_int_setffr<"setffr", int_aarch64_sve_setffr>; def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>; - defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add, null_frag>; - defm SUB_ZZZ : 
sve_int_bin_cons_arit_0<0b001, "sub", sub, null_frag>; - defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>; - defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>; - defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>; - defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>; + defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>; + defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>; + defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>; + defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat>; + defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat>; + defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat>; defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>; defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>; @@ -311,13 +311,13 @@ defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>; defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>; - defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add, null_frag>; - defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub, null_frag>; + defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>; + defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>; defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>; - defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>; - defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>; - defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>; - defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>; + defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>; + defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>; + defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>; + defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat>; defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>; defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -1592,8 +1592,7 @@ let Inst{4-0} = Zd; } -multiclass sve_int_bin_cons_arit_0 opc, string asm, - SDPatternOperator op, SDPatternOperator int_op> { +multiclass sve_int_bin_cons_arit_0 opc, string asm, SDPatternOperator op> { def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>; def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>; def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>; @@ -1603,12 +1602,6 @@ def : SVE_2_Op_Pat(NAME # _H)>; def : SVE_2_Op_Pat(NAME # _S)>; def : SVE_2_Op_Pat(NAME # _D)>; - - // Intrinsic version - def : SVE_2_Op_Pat(NAME # _B)>; - def : SVE_2_Op_Pat(NAME # _H)>; - def : SVE_2_Op_Pat(NAME # _S)>; - def : SVE_2_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -3997,8 +3990,7 @@ let ElementSize = ElementSizeNone; } -multiclass sve_int_arith_imm0 opc, string asm, - SDPatternOperator op, SDPatternOperator int_op> { +multiclass sve_int_arith_imm0 opc, string asm, SDPatternOperator op> { def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>; def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>; def _S : sve_int_arith_imm0<0b10, opc, asm, 
ZPR32, addsub_imm8_opt_lsl_i32>; @@ -4008,12 +4000,6 @@ def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _H)>; def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _S)>; def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _D)>; - - // Intrinsic version - def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _B)>; - def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _H)>; - def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _S)>; - def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _D)>; } multiclass sve_int_arith_imm0_subr opc, string asm, SDPatternOperator op> { diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll @@ -1,6 +1,265 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s +; ADD + +define @add_i8( %a) { +; CHECK-LABEL: add_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.b, z0.b, #127 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %elt = insertelement undef, i8 127, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.add.nxv16i8( %pg, + %a, + %splat) + ret %out +} + +define @add_i16( %a) { +; CHECK-LABEL: add_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.h, z0.h, #127 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 127, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.add.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @add_i16_out_of_range( %a) { +; CHECK-LABEL: add_i16_out_of_range: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #257 +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 257, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.add.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @add_i32( %a) { +; CHECK-LABEL: add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.s, z0.s, #127 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 127, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.add.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @add_i32_out_of_range( %a) { +; CHECK-LABEL: add_i32_out_of_range: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #257 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 257, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.add.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @add_i64( %a) { +; CHECK-LABEL: add_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.d, z0.d, #127 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 127, i64 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.add.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +define @add_i64_out_of_range( %a) { +; CHECK-LABEL: add_i64_out_of_range: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #257 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 257, i64 0 + %splat = shufflevector %elt, undef, 
zeroinitializer + %out = call @llvm.aarch64.sve.add.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +; SUB + +define @sub_i8( %a) { +; CHECK-LABEL: sub_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.b, z0.b, #127 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %elt = insertelement undef, i8 127, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sub.nxv16i8( %pg, + %a, + %splat) + ret %out +} + +define @sub_i16( %a) { +; CHECK-LABEL: sub_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.h, z0.h, #127 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 127, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sub.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @sub_i16_out_of_range( %a) { +; CHECK-LABEL: sub_i16_out_of_range: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #257 +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 257, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sub.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @sub_i32( %a) { +; CHECK-LABEL: sub_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.s, z0.s, #127 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 127, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sub.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @sub_i32_out_of_range( %a) { +; CHECK-LABEL: sub_i32_out_of_range: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #257 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 257, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sub.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @sub_i64( %a) { +; CHECK-LABEL: sub_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.d, z0.d, #127 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 127, i64 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sub.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +define @sub_i64_out_of_range( %a) { +; CHECK-LABEL: sub_i64_out_of_range: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #257 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 257, i64 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sub.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +; As sub_i32 but where pg is i8 based and thus compatible for i32. +define @sub_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: sub_i32_ptrue_all_b: +; CHECK: sub z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.sub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As sub_i32 but where pg is i16 based and thus compatible for i32. 
+define @sub_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: sub_i32_ptrue_all_h: +; CHECK: sub z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.sub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As sub_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @sub_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: sub_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: sub z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.sub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + ; SMAX define @smax_i8( %a) { @@ -1604,6 +1863,16 @@ ret %out } +declare @llvm.aarch64.sve.add.nxv16i8(, , ) +declare @llvm.aarch64.sve.add.nxv8i16(, , ) +declare @llvm.aarch64.sve.add.nxv4i32(, , ) +declare @llvm.aarch64.sve.add.nxv2i64(, , ) + +declare @llvm.aarch64.sve.sub.nxv16i8(, , ) +declare @llvm.aarch64.sve.sub.nxv8i16(, , ) +declare @llvm.aarch64.sve.sub.nxv4i32(, , ) +declare @llvm.aarch64.sve.sub.nxv2i64(, , ) + declare @llvm.aarch64.sve.sqadd.x.nxv16i8(, ) declare @llvm.aarch64.sve.sqadd.x.nxv8i16(, ) declare @llvm.aarch64.sve.sqadd.x.nxv4i32(, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll @@ -0,0 +1,237 @@ +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s + +; +; AND +; + +define @and_i8( %a) #0 { +; CHECK-LABEL: and_i8: +; CHECK: and z0.b, z0.b, #0x7 +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i8 7, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.and.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @and_i16( %a) #0 { +; CHECK-LABEL: and_i16: +; CHECK: and z0.h, z0.h, #0xf0 +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i16 240, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.and.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @and_i32( %a) #0 { +; CHECK-LABEL: and_i32: +; CHECK: and z0.s, z0.s, #0xffff00 +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i32 16776960, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.and.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @and_i64( %a) #0 { +; CHECK-LABEL: and_i64: +; CHECK: and z0.d, z0.d, #0xfffc000000000000 +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i64 18445618173802708992, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.and.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; EOR +; + +define @eor_i8( %a) #0 { +; 
CHECK-LABEL: eor_i8: +; CHECK: eor z0.b, z0.b, #0xf +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i8 15, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.eor.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @eor_i16( %a) #0 { +; CHECK-LABEL: eor_i16: +; CHECK: eor z0.h, z0.h, #0xfc07 +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i16 64519, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.eor.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @eor_i32( %a) #0 { +; CHECK-LABEL: eor_i32: +; CHECK: eor z0.s, z0.s, #0xffff00 +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i32 16776960, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.eor.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @eor_i64( %a) #0 { +; CHECK-LABEL: eor_i64: +; CHECK: eor z0.d, z0.d, #0x1000000000000 +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i64 281474976710656, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.eor.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; ORR +; + +define @orr_i8( %a) #0 { +; CHECK-LABEL: orr_i8: +; CHECK: orr z0.b, z0.b, #0x6 +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i8 6, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.orr.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @orr_i16( %a) #0 { +; CHECK-LABEL: orr_i16: +; CHECK: orr z0.h, z0.h, #0x8001 +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i16 32769, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.orr.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @orr_i32( %a) #0 { +; CHECK-LABEL: orr_i32: +; CHECK: orr z0.s, z0.s, #0xffff +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i32 65535, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.orr.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @orr_i64( %a) #0 { +; CHECK-LABEL: orr_i64: +; CHECK: orr z0.d, z0.d, #0x7ffc000000000000 +; CHECK-NEXT: ret + %pg = shufflevector insertelement ( undef, i1 true, i32 0), undef, zeroinitializer + %b = shufflevector insertelement ( undef, i64 9222246136947933184, i32 0), undef, zeroinitializer + %out = call @llvm.aarch64.sve.orr.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; As orr_i32 but where pg is i8 based and thus compatible for i32. +define @orr_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: orr_i32_ptrue_all_b: +; CHECK: orr z0.s, z0.s, #0xffff +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 65535) + %out = tail call @llvm.aarch64.sve.orr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As orr_i32 but where pg is i16 based and thus compatible for i32. 
+define @orr_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: orr_i32_ptrue_all_h: +; CHECK: orr z0.s, z0.s, #0xffff +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 65535) + %out = tail call @llvm.aarch64.sve.orr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As orr_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @orr_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: orr_i32_ptrue_all_d: +; CHECK-DAG: mov [[IMM:w[0-9]+]], #65535 +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, [[IMM]] +; CHECK-DAG: orr z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 65535) + %out = tail call @llvm.aarch64.sve.orr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +declare @llvm.aarch64.sve.and.nxv16i8(, , ) +declare @llvm.aarch64.sve.and.nxv8i16(, , ) +declare @llvm.aarch64.sve.and.nxv4i32(, , ) +declare @llvm.aarch64.sve.and.nxv2i64(, , ) + +declare @llvm.aarch64.sve.eor.nxv16i8(, , ) +declare @llvm.aarch64.sve.eor.nxv8i16(, , ) +declare @llvm.aarch64.sve.eor.nxv4i32(, , ) +declare @llvm.aarch64.sve.eor.nxv2i64(, , ) + +declare @llvm.aarch64.sve.orr.nxv16i8(, , ) +declare @llvm.aarch64.sve.orr.nxv8i16(, , ) +declare @llvm.aarch64.sve.orr.nxv4i32(, , ) +declare @llvm.aarch64.sve.orr.nxv2i64(, , ) + +declare @llvm.aarch64.sve.convert.from.svbool.nxv4i1() +declare @llvm.aarch64.sve.convert.from.svbool.nxv8i1() +declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() + +declare @llvm.aarch64.sve.convert.to.svbool.nxv4i1() +declare @llvm.aarch64.sve.convert.to.svbool.nxv8i1() +declare @llvm.aarch64.sve.convert.to.svbool.nxv2i1() + +declare @llvm.aarch64.sve.dup.x.nxv4i32(i32) + +declare @llvm.aarch64.sve.ptrue.nxv16i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv4i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv2i1(i32) + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll @@ -2,6 +2,145 @@ target triple = "aarch64-unknown-linux-gnu" +; +; ADD +; + +define @add_i8( %a, %b) #0 { +; CHECK-LABEL: add_i8: +; CHECK: add z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.add.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @add_i16( %a, %b) #0 { +; CHECK-LABEL: add_i16: +; CHECK: add z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.add.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @add_i32( %a, %b) #0 { +; CHECK-LABEL: add_i32: +; CHECK: add z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.add.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @add_i64( %a, %b) #0 { +; CHECK-LABEL: add_i64: +; CHECK: add z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = 
call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.add.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; SUB +; + +define @sub_i8( %a, %b) #0 { +; CHECK-LABEL: sub_i8: +; CHECK: sub z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.sub.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @sub_i16( %a, %b) #0 { +; CHECK-LABEL: sub_i16: +; CHECK: sub z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.sub.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @sub_i32( %a, %b) #0 { +; CHECK-LABEL: sub_i32: +; CHECK: sub z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.sub.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @sub_i64( %a, %b) #0 { +; CHECK-LABEL: sub_i64: +; CHECK: sub z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.sub.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; As sub_i32 but where pg is i8 based and thus compatible for i32. +define @sub_i32_ptrue_all_b( %a, %b) #0 { +; CHECK-LABEL: sub_i32_ptrue_all_b: +; CHECK: sub z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.sub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As sub_i32 but where pg is i16 based and thus compatible for i32. +define @sub_i32_ptrue_all_h( %a, %b) #0 { +; CHECK-LABEL: sub_i32_ptrue_all_h: +; CHECK: sub z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.sub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As sub_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. 
+define @sub_i32_ptrue_all_d( %a, %b) #0 { +; CHECK-LABEL: sub_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: sub z0.s, [[PG]]/m, z0.s, z1.s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.sub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + ; ; MUL ; @@ -189,6 +328,428 @@ ret %out } +; +; AND +; + +define @and_i8( %a, %b) #0 { +; CHECK-LABEL: and_i8: +; CHECK: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.and.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @and_i16( %a, %b) #0 { +; CHECK-LABEL: and_i16: +; CHECK: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.and.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @and_i32( %a, %b) #0 { +; CHECK-LABEL: and_i32: +; CHECK: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.and.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @and_i64( %a, %b) #0 { +; CHECK-LABEL: and_i64: +; CHECK: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.and.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; EOR +; + +define @eor_i8( %a, %b) #0 { +; CHECK-LABEL: eor_i8: +; CHECK: eor z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.eor.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @eor_i16( %a, %b) #0 { +; CHECK-LABEL: eor_i16: +; CHECK: eor z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.eor.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @eor_i32( %a, %b) #0 { +; CHECK-LABEL: eor_i32: +; CHECK: eor z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.eor.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @eor_i64( %a, %b) #0 { +; CHECK-LABEL: eor_i64: +; CHECK: eor z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.eor.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; ORR +; + +define @orr_i8( %a, %b) #0 { +; CHECK-LABEL: orr_i8: +; CHECK: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.orr.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @orr_i16( %a, %b) #0 { +; CHECK-LABEL: orr_i16: +; CHECK: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.orr.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @orr_i32( %a, %b) #0 { +; CHECK-LABEL: orr_i32: +; CHECK: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.orr.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @orr_i64( %a, %b) #0 { +; CHECK-LABEL: orr_i64: +; CHECK: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.orr.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; As orr_i32 but where pg is i8 based and thus compatible for i32. 
+define @orr_i32_ptrue_all_b( %a, %b) #0 { +; CHECK-LABEL: orr_i32_ptrue_all_b: +; CHECK: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.orr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As orr_i32 but where pg is i16 based and thus compatible for i32. +define @orr_i32_ptrue_all_h( %a, %b) #0 { +; CHECK-LABEL: orr_i32_ptrue_all_h: +; CHECK: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.orr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As orr_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @orr_i32_ptrue_all_d( %a, %b) #0 { +; CHECK-LABEL: orr_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: orr z0.s, [[PG]]/m, z0.s, z1.s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.orr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; +; SQADD +; + +define @sqadd_i8( %a, %b) #0 { +; CHECK-LABEL: sqadd_i8: +; CHECK: sqadd z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.sqadd.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @sqadd_i16( %a, %b) #0 { +; CHECK-LABEL: sqadd_i16: +; CHECK: sqadd z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.sqadd.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @sqadd_i32( %a, %b) #0 { +; CHECK-LABEL: sqadd_i32: +; CHECK: sqadd z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.sqadd.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @sqadd_i64( %a, %b) #0 { +; CHECK-LABEL: sqadd_i64: +; CHECK: sqadd z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.sqadd.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; SQSUB +; + +define @sqsub_i8( %a, %b) #0 { +; CHECK-LABEL: sqsub_i8: +; CHECK: sqsub z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.sqsub.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @sqsub_i16( %a, %b) #0 { +; CHECK-LABEL: sqsub_i16: +; CHECK: sqsub z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.sqsub.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @sqsub_i32( %a, %b) #0 { +; CHECK-LABEL: sqsub_i32: +; CHECK: sqsub z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.sqsub.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @sqsub_i64( %a, %b) #0 { +; CHECK-LABEL: sqsub_i64: +; CHECK: sqsub z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.sqsub.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; UQADD +; + +define @uqadd_i8( %a, %b) #0 { +; CHECK-LABEL: uqadd_i8: +; CHECK: uqadd z0.b, z0.b, z1.b +; 
CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.uqadd.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @uqadd_i16( %a, %b) #0 { +; CHECK-LABEL: uqadd_i16: +; CHECK: uqadd z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.uqadd.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @uqadd_i32( %a, %b) #0 { +; CHECK-LABEL: uqadd_i32: +; CHECK: uqadd z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.uqadd.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @uqadd_i64( %a, %b) #0 { +; CHECK-LABEL: uqadd_i64: +; CHECK: uqadd z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.uqadd.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; UQSUB +; + +define @uqsub_i8( %a, %b) #0 { +; CHECK-LABEL: uqsub_i8: +; CHECK: uqsub z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.uqsub.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @uqsub_i16( %a, %b) #0 { +; CHECK-LABEL: uqsub_i16: +; CHECK: uqsub z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.uqsub.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @uqsub_i32( %a, %b) #0 { +; CHECK-LABEL: uqsub_i32: +; CHECK: uqsub z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.uqsub.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @uqsub_i64( %a, %b) #0 { +; CHECK-LABEL: uqsub_i64: +; CHECK: uqsub z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.uqsub.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; As uqsub_i32 but where pg is i8 based and thus compatible for i32. +define @uqsub_i32_ptrue_all_b( %a, %b) #0 { +; CHECK-LABEL: uqsub_i32_ptrue_all_b: +; CHECK: uqsub z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.uqsub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As uqsub_i32 but where pg is i16 based and thus compatible for i32. +define @uqsub_i32_ptrue_all_h( %a, %b) #0 { +; CHECK-LABEL: uqsub_i32_ptrue_all_h: +; CHECK: uqsub z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.uqsub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As uqsub_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. 
+define @uqsub_i32_ptrue_all_d( %a, %b) #0 { +; CHECK-LABEL: uqsub_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: uqsub z0.s, [[PG]]/m, z0.s, z1.s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.uqsub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + ; ; ASR (wide) ; @@ -454,6 +1015,16 @@ ret %out } +declare @llvm.aarch64.sve.add.nxv16i8(, , ) +declare @llvm.aarch64.sve.add.nxv8i16(, , ) +declare @llvm.aarch64.sve.add.nxv4i32(, , ) +declare @llvm.aarch64.sve.add.nxv2i64(, , ) + +declare @llvm.aarch64.sve.sub.nxv16i8(, , ) +declare @llvm.aarch64.sve.sub.nxv8i16(, , ) +declare @llvm.aarch64.sve.sub.nxv4i32(, , ) +declare @llvm.aarch64.sve.sub.nxv2i64(, , ) + declare @llvm.aarch64.sve.mul.nxv16i8(, , ) declare @llvm.aarch64.sve.mul.nxv8i16(, , ) declare @llvm.aarch64.sve.mul.nxv4i32(, , ) @@ -469,6 +1040,41 @@ declare @llvm.aarch64.sve.umulh.nxv4i32(, , ) declare @llvm.aarch64.sve.umulh.nxv2i64(, , ) +declare @llvm.aarch64.sve.and.nxv16i8(, , ) +declare @llvm.aarch64.sve.and.nxv8i16(, , ) +declare @llvm.aarch64.sve.and.nxv4i32(, , ) +declare @llvm.aarch64.sve.and.nxv2i64(, , ) + +declare @llvm.aarch64.sve.eor.nxv16i8(, , ) +declare @llvm.aarch64.sve.eor.nxv8i16(, , ) +declare @llvm.aarch64.sve.eor.nxv4i32(, , ) +declare @llvm.aarch64.sve.eor.nxv2i64(, , ) + +declare @llvm.aarch64.sve.orr.nxv16i8(, , ) +declare @llvm.aarch64.sve.orr.nxv8i16(, , ) +declare @llvm.aarch64.sve.orr.nxv4i32(, , ) +declare @llvm.aarch64.sve.orr.nxv2i64(, , ) + +declare @llvm.aarch64.sve.sqadd.nxv16i8(, , ) +declare @llvm.aarch64.sve.sqadd.nxv8i16(, , ) +declare @llvm.aarch64.sve.sqadd.nxv4i32(, , ) +declare @llvm.aarch64.sve.sqadd.nxv2i64(, , ) + +declare @llvm.aarch64.sve.sqsub.nxv16i8(, , ) +declare @llvm.aarch64.sve.sqsub.nxv8i16(, , ) +declare @llvm.aarch64.sve.sqsub.nxv4i32(, , ) +declare @llvm.aarch64.sve.sqsub.nxv2i64(, , ) + +declare @llvm.aarch64.sve.uqadd.nxv16i8(, , ) +declare @llvm.aarch64.sve.uqadd.nxv8i16(, , ) +declare @llvm.aarch64.sve.uqadd.nxv4i32(, , ) +declare @llvm.aarch64.sve.uqadd.nxv2i64(, , ) + +declare @llvm.aarch64.sve.uqsub.nxv16i8(, , ) +declare @llvm.aarch64.sve.uqsub.nxv8i16(, , ) +declare @llvm.aarch64.sve.uqsub.nxv4i32(, , ) +declare @llvm.aarch64.sve.uqsub.nxv2i64(, , ) + declare @llvm.aarch64.sve.asr.wide.nxv16i8(, , ) declare @llvm.aarch64.sve.asr.wide.nxv8i16(, , ) declare @llvm.aarch64.sve.asr.wide.nxv4i32(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll @@ -0,0 +1,484 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; SQADD + +define @sqadd_b_lowimm( %a) { +; CHECK-LABEL: sqadd_b_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.b, z0.b, #27 // =0x1b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %elt = insertelement undef, i8 27, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqadd.nxv16i8( %pg, + %a, + %splat) + ret %out +} + +define @sqadd_h_lowimm( %a) { +; CHECK-LABEL: sqadd_h_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.h, z0.h, #43 // =0x2b +; CHECK-NEXT: ret + %pg = call 
@llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 43, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqadd.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @sqadd_h_highimm( %a) { +; CHECK-LABEL: sqadd_h_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.h, z0.h, #2048 // =0x800 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 2048, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqadd.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @sqadd_s_lowimm( %a) { +; CHECK-LABEL: sqadd_s_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.s, z0.s, #1 // =0x1 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 1, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqadd.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @sqadd_s_highimm( %a) { +; CHECK-LABEL: sqadd_s_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.s, z0.s, #8192 // =0x2000 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 8192, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqadd.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @sqadd_d_lowimm( %a) { +; CHECK-LABEL: sqadd_d_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.d, z0.d, #255 // =0xff +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 255, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqadd.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +define @sqadd_d_highimm( %a) { +; CHECK-LABEL: sqadd_d_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.d, z0.d, #65280 // =0xff00 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 65280, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqadd.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +; SQSUB + +define @sqsub_b_lowimm( %a) { +; CHECK-LABEL: sqsub_b_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.b, z0.b, #27 // =0x1b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %elt = insertelement undef, i8 27, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqsub.nxv16i8( %pg, + %a, + %splat) + ret %out +} + +define @sqsub_h_lowimm( %a) { +; CHECK-LABEL: sqsub_h_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.h, z0.h, #43 // =0x2b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 43, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqsub.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @sqsub_h_highimm( %a) { +; CHECK-LABEL: sqsub_h_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.h, z0.h, #2048 // =0x800 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 2048, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqsub.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @sqsub_s_lowimm( %a) { +; CHECK-LABEL: sqsub_s_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.s, z0.s, #1 // =0x1 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 1, 
i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqsub.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @sqsub_s_highimm( %a) { +; CHECK-LABEL: sqsub_s_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.s, z0.s, #8192 // =0x2000 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 8192, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqsub.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @sqsub_d_lowimm( %a) { +; CHECK-LABEL: sqsub_d_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.d, z0.d, #255 // =0xff +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 255, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqsub.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +define @sqsub_d_highimm( %a) { +; CHECK-LABEL: sqsub_d_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.d, z0.d, #65280 // =0xff00 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 65280, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.sqsub.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +; UQADD + +define @uqadd_b_lowimm( %a) { +; CHECK-LABEL: uqadd_b_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.b, z0.b, #27 // =0x1b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %elt = insertelement undef, i8 27, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.nxv16i8( %pg, + %a, + %splat) + ret %out +} + +define @uqadd_h_lowimm( %a) { +; CHECK-LABEL: uqadd_h_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.h, z0.h, #43 // =0x2b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 43, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @uqadd_h_highimm( %a) { +; CHECK-LABEL: uqadd_h_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.h, z0.h, #2048 // =0x800 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 2048, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @uqadd_s_lowimm( %a) { +; CHECK-LABEL: uqadd_s_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.s, z0.s, #1 // =0x1 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 1, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @uqadd_s_highimm( %a) { +; CHECK-LABEL: uqadd_s_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.s, z0.s, #8192 // =0x2000 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 8192, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @uqadd_d_lowimm( %a) { +; CHECK-LABEL: uqadd_d_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.d, z0.d, #255 // =0xff +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 255, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = 
call @llvm.aarch64.sve.uqadd.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +define @uqadd_d_highimm( %a) { +; CHECK-LABEL: uqadd_d_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.d, z0.d, #65280 // =0xff00 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 65280, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +; UQSUB + +define @uqsub_b_lowimm( %a) { +; CHECK-LABEL: uqsub_b_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.b, z0.b, #27 // =0x1b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %elt = insertelement undef, i8 27, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqsub.nxv16i8( %pg, + %a, + %splat) + ret %out +} + +define @uqsub_h_lowimm( %a) { +; CHECK-LABEL: uqsub_h_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.h, z0.h, #43 // =0x2b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 43, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqsub.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @uqsub_h_highimm( %a) { +; CHECK-LABEL: uqsub_h_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.h, z0.h, #2048 // =0x800 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %elt = insertelement undef, i16 2048, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqsub.nxv8i16( %pg, + %a, + %splat) + ret %out +} + +define @uqsub_s_lowimm( %a) { +; CHECK-LABEL: uqsub_s_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.s, z0.s, #1 // =0x1 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 1, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqsub.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @uqsub_s_highimm( %a) { +; CHECK-LABEL: uqsub_s_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.s, z0.s, #8192 // =0x2000 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %elt = insertelement undef, i32 8192, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqsub.nxv4i32( %pg, + %a, + %splat) + ret %out +} + +define @uqsub_d_lowimm( %a) { +; CHECK-LABEL: uqsub_d_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.d, z0.d, #255 // =0xff +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 255, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqsub.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +define @uqsub_d_highimm( %a) { +; CHECK-LABEL: uqsub_d_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.d, z0.d, #65280 // =0xff00 +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %elt = insertelement undef, i64 65280, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqsub.nxv2i64( %pg, + %a, + %splat) + ret %out +} + +; As uqsub_i32 but where pg is i8 based and thus compatible for i32. 
+define @uqsub_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: uqsub_i32_ptrue_all_b: +; CHECK: uqsub z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.uqsub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As uqsub_i32 but where pg is i16 based and thus compatible for i32. +define @uqsub_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: uqsub_i32_ptrue_all_h: +; CHECK: uqsub z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.uqsub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As uqsub_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @uqsub_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: uqsub_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: uqsub z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.uqsub.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +declare @llvm.aarch64.sve.sqadd.nxv16i8(, , ) +declare @llvm.aarch64.sve.sqadd.nxv8i16(, , ) +declare @llvm.aarch64.sve.sqadd.nxv4i32(, , ) +declare @llvm.aarch64.sve.sqadd.nxv2i64(, , ) + +declare @llvm.aarch64.sve.sqsub.nxv16i8(, , ) +declare @llvm.aarch64.sve.sqsub.nxv8i16(, , ) +declare @llvm.aarch64.sve.sqsub.nxv4i32(, , ) +declare @llvm.aarch64.sve.sqsub.nxv2i64(, , ) + +declare @llvm.aarch64.sve.uqadd.nxv16i8(, , ) +declare @llvm.aarch64.sve.uqadd.nxv8i16(, , ) +declare @llvm.aarch64.sve.uqadd.nxv4i32(, , ) +declare @llvm.aarch64.sve.uqadd.nxv2i64(, , ) + +declare @llvm.aarch64.sve.uqsub.nxv16i8(, , ) +declare @llvm.aarch64.sve.uqsub.nxv8i16(, , ) +declare @llvm.aarch64.sve.uqsub.nxv4i32(, , ) +declare @llvm.aarch64.sve.uqsub.nxv2i64(, , ) + +declare @llvm.aarch64.sve.convert.from.svbool.nxv4i1() +declare @llvm.aarch64.sve.convert.from.svbool.nxv8i1() +declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() + +declare @llvm.aarch64.sve.convert.to.svbool.nxv4i1() +declare @llvm.aarch64.sve.convert.to.svbool.nxv8i1() +declare @llvm.aarch64.sve.convert.to.svbool.nxv2i1() + +declare @llvm.aarch64.sve.dup.x.nxv4i32(i32) + +declare @llvm.aarch64.sve.ptrue.nxv16i1(i32 %pattern) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32 %pattern) +declare @llvm.aarch64.sve.ptrue.nxv4i1(i32 %pattern) +declare @llvm.aarch64.sve.ptrue.nxv2i1(i32 %pattern)
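Note (illustrative, not part of the patch): the new UnpredOp path in convertMergedOpToPredOp means a merged SVE intrinsic whose governing predicate is an all-active ptrue is rebuilt on the generic ISD node (ISD::ADD, ISD::SUB, ISD::AND, ...), so the existing unpredicated patterns can select it, including the immediate forms. A minimal standalone sketch of the kind of input the tests above exercise, with the scalable vector types written out in full (the function name is hypothetical; the intrinsic signatures are the ones the tests use):

; Sketch only: an all-active predicate lets the merged ADD select the
; unpredicated immediate form (expected: add z0.s, z0.s, #127).
define <vscale x 4 x i32> @example_add_imm(<vscale x 4 x i32> %a) #0 {
  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %elt = insertelement <vscale x 4 x i32> undef, i32 127, i32 0
  %splat = shufflevector <vscale x 4 x i32> %elt, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %splat)
  ret <vscale x 4 x i32> %out
}

declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
declare <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)

attributes #0 = { "target-features"="+sve" }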
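The *_ptrue_all_* tests cover the predicate-compatibility point: a ptrue.b or ptrue.h round-tripped through convert.to.svbool/convert.from.svbool still has every .s governing lane set, so the relaxation applies, whereas a ptrue.d reinterpreted for .s only activates every other .s lane, so the merged (predicated) form must be kept. A sketch of the incompatible case, under the same assumptions as above:

; Sketch only: a .d-based ptrue is not all-active for .s lanes, so the
; predicated form is preserved (e.g. sub z0.s, p0/m, z0.s, z1.s).
define <vscale x 4 x i32> @example_sub_d_pred(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
  %pg.d = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
  %pg.b = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
  %pg.s = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sub.nxv4i32(<vscale x 4 x i1> %pg.s, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %out
}

declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.sub.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)

attributes #0 = { "target-features"="+sve" }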