diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -335,6 +335,8 @@
   bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
   bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
                                SDValue &Offset);
+
+  bool SelectAllActivePredicate(SDValue N);
 };

 } // end anonymous namespace
@@ -4983,3 +4985,26 @@

   return false;
 }
+
+bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
+  unsigned NumElts = N.getValueType().getVectorMinNumElements();
+
+  // Look through cast.
+  SDValue &Op = N;
+  while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
+    Op = Op.getOperand(0);
+    // When reinterpreting from a type with fewer elements the "new" elements
+    // are not active, so bail if they're likely to be used.
+    if (Op.getValueType().getVectorMinNumElements() < NumElts)
+      return false;
+  }
+
+  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same
+  // size or smaller than the implicit element type represented by N.
+  // NOTE: A larger element count implies a smaller element type.
+  if (Op.getOpcode() == AArch64ISD::PTRUE &&
+      Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
+    return Op.getValueType().getVectorMinNumElements() >= NumElts;
+
+  return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13861,6 +13861,30 @@
                      Zero);
 }

+static bool isAllActivePredicate(SDValue N) {
+  // Matches AArch64DAGToDAGISel::SelectAllActivePredicate().
+  unsigned NumElts = N.getValueType().getVectorMinNumElements();
+
+  // Look through cast.
+  SDValue &Op = N;
+  while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
+    Op = Op.getOperand(0);
+    // When reinterpreting from a type with fewer elements the "new" elements
+    // are not active, so bail if they're likely to be used.
+    if (Op.getValueType().getVectorMinNumElements() < NumElts)
+      return false;
+  }
+
+  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same
+  // size or smaller than the implicit element type represented by N.
+  // NOTE: A larger element count implies a smaller element type.
+  if (Op.getOpcode() == AArch64ISD::PTRUE &&
+      Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
+    return Op.getValueType().getVectorMinNumElements() >= NumElts;
+
+  return false;
+}
+
 // If a merged operation has no inactive lanes we can relax it to a predicated
 // or unpredicated operation, which potentially allows better isel (perhaps
 // using immediate forms) or relaxing register reuse requirements.
@@ -13871,8 +13895,7 @@
   SDValue Pg = N->getOperand(1);

   // ISD way to specify an all active predicate.
-  if ((Pg.getOpcode() == AArch64ISD::PTRUE) &&
-      (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all))
+  if (isAllActivePredicate(Pg))
     return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg,
                        N->getOperand(2), N->getOperand(3));

@@ -13965,6 +13988,8 @@
                        N->getOperand(1));
   case Intrinsic::aarch64_sve_ext:
     return LowerSVEIntrinsicEXT(N, DAG);
+  case Intrinsic::aarch64_sve_mul:
+    return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
   case Intrinsic::aarch64_sve_smin:
     return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
   case Intrinsic::aarch64_sve_umin:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -335,9 +335,9 @@
   defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_p>;
   defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_p>;

-  defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>;
-  defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>;
-  defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>;
+  defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or, int_aarch64_sve_orr>;
+  defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor, int_aarch64_sve_eor>;
+  defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and, int_aarch64_sve_and>;

   defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_p>;
   defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_p>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -221,6 +221,8 @@
 def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>;
 def SVEShiftImmR64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 64, true>", []>;

+def SVEAllActive : ComplexPattern<untyped, 0, "SelectAllActivePredicate", []>;
+
 class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
   let Name = "SVEExactFPImmOperand" # Suffix;
   let DiagnosticType = "Invalid" # Name;
@@ -339,9 +341,9 @@
   : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))),
         (inst $Op1, i32:$imm, i32:$shift)>;

-class SVE_1_Op_Imm_Arith_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
-                                  ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
-  : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
+class SVE_1_Op_Imm_Arith_All_Active<ValueType vt, ValueType pt, SDPatternOperator op,
+                                    ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
+  : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
         (inst $Op1, i32:$imm)>;

 class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
                            ValueType it, ComplexPattern cpx, Instruction inst>
   : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))),
         (inst $Op1, i64:$imm)>;

+class SVE_1_Op_Imm_Log_All_Active<ValueType vt, ValueType pt, SDPatternOperator op,
+                                  ZPRRegOp zprty, ValueType it, ComplexPattern cpx,
+                                  Instruction inst>
+  : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))),
+        (inst $Op1, i64:$imm)>;
+
 class SVE_2_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
                    ValueType vt2, Instruction inst>
   : Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
@@ -1494,13 +1501,19 @@
   let ElementSize = ElementSizeNone;
 }

-multiclass sve_int_log_imm<bits<2> opc, string asm, string alias, SDPatternOperator op> {
+multiclass sve_int_log_imm<bits<2> opc, string asm, string alias,
+                           SDPatternOperator ir_op, SDPatternOperator int_op> {
   def NAME : sve_int_log_imm<opc, asm>;

-  def : SVE_1_Op_Imm_Log_Pat<nxv16i8, op, ZPR8,  i64, SVELogicalImm8Pat,  !cast<Instruction>(NAME)>;
-  def : SVE_1_Op_Imm_Log_Pat<nxv8i16, op, ZPR16, i64, SVELogicalImm16Pat, !cast<Instruction>(NAME)>;
-  def : SVE_1_Op_Imm_Log_Pat<nxv4i32, op, ZPR32, i64, SVELogicalImm32Pat, !cast<Instruction>(NAME)>;
-  def : SVE_1_Op_Imm_Log_Pat<nxv2i64, op, ZPR64, i64, SVELogicalImm64Pat, !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Imm_Log_Pat<nxv16i8, ir_op, ZPR8,  i64, SVELogicalImm8Pat,  !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Imm_Log_Pat<nxv8i16, ir_op, ZPR16, i64, SVELogicalImm16Pat, !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Imm_Log_Pat<nxv4i32, ir_op, ZPR32, i64, SVELogicalImm32Pat, !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Imm_Log_Pat<nxv2i64, ir_op, ZPR64, i64, SVELogicalImm64Pat, !cast<Instruction>(NAME)>;
+
+  def : SVE_1_Op_Imm_Log_All_Active<nxv16i8, nxv16i1, int_op, ZPR8,  i64, SVELogicalImm8Pat,  !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Imm_Log_All_Active<nxv8i16, nxv8i1,  int_op, ZPR16, i64, SVELogicalImm16Pat, !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Imm_Log_All_Active<nxv4i32, nxv4i1,  int_op, ZPR32, i64, SVELogicalImm32Pat, !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Imm_Log_All_Active<nxv2i64, nxv2i1,  int_op, ZPR64, i64, SVELogicalImm64Pat, !cast<Instruction>(NAME)>;

   def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
                   (!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8:$imm), 4>;
@@ -4037,10 +4050,10 @@
   def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>;
   def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8,  i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1,  op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1,  op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1,  op, ZPR64, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8,  i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1,  op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1,  op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1,  op, ZPR64, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
 }

 multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> {
@@ -4049,10 +4062,10 @@
   def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>;
   def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>;

-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8,  i32, SVEArithUImm8Pat,  !cast<Instruction>(NAME # _B)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1,  op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1,  op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1,  op, ZPR64, i32, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8,  i32, SVEArithUImm8Pat,  !cast<Instruction>(NAME # _B)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1,  op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1,  op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1,  op, ZPR64, i32, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>;
 }

 multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
   def _B : sve_int_arith_imm<0b00, 0b110000, asm, ZPR8, simm8>;
   def _H : sve_int_arith_imm<0b01, 0b110000, asm, ZPR16, simm8>;
@@ -4061,10 +4074,10 @@
   def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
   def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;

-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8,  i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1,  op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1,  op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1,  op, ZPR64, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8,  i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1,  op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1,  op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1,  op, ZPR64, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
 }

 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
@@ -110,6 +110,52 @@
   ret <vscale x 2 x i64> %out
 }

+; As smax_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_b:
+; CHECK: smax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smax_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_h:
+; CHECK: smax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smax_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: smax z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}

 ; SMIN

@@ -220,6 +266,53 @@
   ret <vscale x 2 x i64> %out
 }

+; As smin_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_b:
+; CHECK: smin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smin_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_h:
+; CHECK: smin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smin_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: smin z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+

 ; UMAX

 define <vscale x 16 x i8> @umax_i8(<vscale x 16 x i8> %a) {
@@ -329,6 +422,53 @@
   ret <vscale x 2 x i64> %out
 }

+; As umax_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_b:
+; CHECK: umax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umax_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_h:
+; CHECK: umax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umax_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: umax z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+

 ; UMIN

 define <vscale x 16 x i8> @umin_i8(<vscale x 16 x i8> %a) {
@@ -438,6 +578,53 @@
   ret <vscale x 2 x i64> %out
 }

+; As umin_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_b:
+; CHECK: umin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umin_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_h:
+; CHECK: umin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umin_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: umin z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+

 ; SQADD

 define <vscale x 16 x i8> @sqadd_b_lowimm(<vscale x 16 x i8> %a) {
@@ -1321,6 +1508,57 @@
   ret <vscale x 2 x i64> %out
 }

+;
+; MUL
+;
+
+; As mul_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_b:
+; CHECK: mul z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As mul_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_h:
+; CHECK: mul z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As mul_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: mul z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+

 declare <vscale x 16 x i8> @llvm.aarch64.sve.sqadd.x.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.sqadd.x.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.sqadd.x.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -1376,6 +1614,21 @@
 declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)

+declare <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+
 declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 %pattern)
 declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 %pattern)
 declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 %pattern)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll
@@ -0,0 +1,237 @@
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+;
+; AND
+;
+
+define <vscale x 16 x i8> @and_i8(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: and_i8:
+; CHECK: and z0.b, z0.b, #0x7
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 16 x i1> insertelement (<vscale x 16 x i1> undef, i1 true, i32 0), <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer
+  %b = shufflevector <vscale x 16 x i8> insertelement (<vscale x 16 x i8> undef, i8 7, i32 0), <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.and.nxv16i8(<vscale x 16 x i1> %pg,
+                                                               <vscale x 16 x i8> %a,
+                                                               <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @and_i16(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: and_i16:
+; CHECK: and z0.h, z0.h, #0xf0
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 8 x i1> insertelement (<vscale x 8 x i1> undef, i1 true, i32 0), <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
+  %b = shufflevector <vscale x 8 x i16> insertelement (<vscale x 8 x i16> undef, i16 240, i32 0), <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.and.nxv8i16(<vscale x 8 x i1> %pg,
+                                                               <vscale x 8 x i16> %a,
+                                                               <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @and_i32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: and_i32:
+; CHECK: and z0.s, z0.s, #0xffff00
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 4 x i1> insertelement (<vscale x 4 x i1> undef, i1 true, i32 0), <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+  %b = shufflevector <vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 16776960, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.and.nxv4i32(<vscale x 4 x i1> %pg,
+                                                               <vscale x 4 x i32> %a,
+                                                               <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @and_i64(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: and_i64:
+; CHECK: and z0.d, z0.d, #0xfffc000000000000
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+  %b = shufflevector <vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 18445618173802708992, i32 0), <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.and.nxv2i64(<vscale x 2 x i1> %pg,
+                                                               <vscale x 2 x i64> %a,
+                                                               <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+;
+; EOR
+;
+
+define <vscale x 16 x i8> @eor_i8(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: eor_i8:
+; CHECK: eor z0.b, z0.b, #0xf
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 16 x i1> insertelement (<vscale x 16 x i1> undef, i1 true, i32 0), <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer
+  %b = shufflevector <vscale x 16 x i8> insertelement (<vscale x 16 x i8> undef, i8 15, i32 0), <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.eor.nxv16i8(<vscale x 16 x i1> %pg,
+                                                               <vscale x 16 x i8> %a,
+                                                               <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @eor_i16(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: eor_i16:
+; CHECK: eor z0.h, z0.h, #0xfc07
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 8 x i1> insertelement (<vscale x 8 x i1> undef, i1 true, i32 0), <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
+  %b = shufflevector <vscale x 8 x i16> insertelement (<vscale x 8 x i16> undef, i16 64519, i32 0), <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.eor.nxv8i16(<vscale x 8 x i1> %pg,
+                                                               <vscale x 8 x i16> %a,
+                                                               <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @eor_i32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: eor_i32:
+; CHECK: eor z0.s, z0.s, #0xffff00
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 4 x i1> insertelement (<vscale x 4 x i1> undef, i1 true, i32 0), <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+  %b = shufflevector <vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 16776960, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.eor.nxv4i32(<vscale x 4 x i1> %pg,
+                                                               <vscale x 4 x i32> %a,
+                                                               <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @eor_i64(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: eor_i64:
+; CHECK: eor z0.d, z0.d, #0x1000000000000
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+  %b = shufflevector <vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 281474976710656, i32 0), <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.eor.nxv2i64(<vscale x 2 x i1> %pg,
+                                                               <vscale x 2 x i64> %a,
+                                                               <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+;
+; ORR
+;
+
+define <vscale x 16 x i8> @orr_i8(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: orr_i8:
+; CHECK: orr z0.b, z0.b, #0x6
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 16 x i1> insertelement (<vscale x 16 x i1> undef, i1 true, i32 0), <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer
+  %b = shufflevector <vscale x 16 x i8> insertelement (<vscale x 16 x i8> undef, i8 6, i32 0), <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.orr.nxv16i8(<vscale x 16 x i1> %pg,
+                                                               <vscale x 16 x i8> %a,
+                                                               <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @orr_i16(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: orr_i16:
+; CHECK: orr z0.h, z0.h, #0x8001
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 8 x i1> insertelement (<vscale x 8 x i1> undef, i1 true, i32 0), <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
+  %b = shufflevector <vscale x 8 x i16> insertelement (<vscale x 8 x i16> undef, i16 32769, i32 0), <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.orr.nxv8i16(<vscale x 8 x i1> %pg,
+                                                               <vscale x 8 x i16> %a,
+                                                               <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @orr_i32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: orr_i32:
+; CHECK: orr z0.s, z0.s, #0xffff
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 4 x i1> insertelement (<vscale x 4 x i1> undef, i1 true, i32 0), <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+  %b = shufflevector <vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 65535, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.orr.nxv4i32(<vscale x 4 x i1> %pg,
+                                                               <vscale x 4 x i32> %a,
+                                                               <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @orr_i64(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: orr_i64:
+; CHECK: orr z0.d, z0.d, #0x7ffc000000000000
+; CHECK-NEXT: ret
+  %pg = shufflevector <vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+  %b = shufflevector <vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 9222246136947933184, i32 0), <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.orr.nxv2i64(<vscale x 2 x i1> %pg,
+                                                               <vscale x 2 x i64> %a,
+                                                               <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+; As orr_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @orr_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: orr_i32_ptrue_all_b:
+; CHECK: orr z0.s, z0.s, #0xffff
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 65535)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.orr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As orr_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @orr_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: orr_i32_ptrue_all_h:
+; CHECK: orr z0.s, z0.s, #0xffff
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 65535)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.orr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As orr_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @orr_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: orr_i32_ptrue_all_d:
+; CHECK-DAG: mov [[IMM:w[0-9]+]], #65535
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, [[IMM]]
+; CHECK-DAG: orr z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 65535)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.orr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.and.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.and.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.and.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.and.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.eor.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.eor.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.eor.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.eor.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.orr.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.orr.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.orr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.orr.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+attributes #0 = { "target-features"="+sve" }
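
Note (not part of the patch): the IR below is an illustrative sketch of the case this change targets, mirroring orr_i32_ptrue_all_b above. It assumes the intrinsic declarations and the #0 attribute group from the new test file; the function name and the constant 255 are made up for illustration.

; Sketch only. An all-active ptrue created at the i8 granularity and
; reinterpreted down to a nxv4i1 predicate is now recognised as all active,
; so isel is expected to pick the unpredicated immediate form
; (e.g. "orr z0.s, z0.s, #0xff") instead of materialising the splat in a
; register and using the predicated register-register ORR.
define <vscale x 4 x i32> @orr_imm_sketch(<vscale x 4 x i32> %a) #0 {
  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
  %splat = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 255)
  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.orr.nxv4i32(<vscale x 4 x i1> %pg.s,
                                                                    <vscale x 4 x i32> %a,
                                                                    <vscale x 4 x i32> %splat)
  ret <vscale x 4 x i32> %out
}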