diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -948,6 +948,13 @@
                  llvm_i32_ty],
                 [IntrNoMem, ImmArg<3>, ImmArg<4>]>;
 
+  class AdvSIMD_SVE_DUP_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>,
+                 LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                 LLVMVectorElementType<0>],
+                [IntrNoMem]>;
+
   class AdvSIMD_SVE_EXPA_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMVectorOfBitcastsToInt<0>],
@@ -1225,6 +1232,12 @@
 def int_aarch64_sve_stnt1 : AdvSIMD_1Vec_PredStore_Intrinsic;
 
 //
+// Scalar to vector operations
+//
+
+def int_aarch64_sve_dup : AdvSIMD_SVE_DUP_Intrinsic;
+
+//
 // Integer arithmetic
 //
 
@@ -1235,6 +1248,7 @@
 def int_aarch64_sve_pmul       : AdvSIMD_2VectorArg_Intrinsic;
 
 def int_aarch64_sve_mul        : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_mul_lane   : AdvSIMD_2VectorArgIndexed_Intrinsic;
 def int_aarch64_sve_smulh      : AdvSIMD_Pred2VectorArg_Intrinsic;
 def int_aarch64_sve_umulh      : AdvSIMD_Pred2VectorArg_Intrinsic;
 
@@ -1253,7 +1267,9 @@
 def int_aarch64_sve_mad        : AdvSIMD_Pred3VectorArg_Intrinsic;
 def int_aarch64_sve_msb        : AdvSIMD_Pred3VectorArg_Intrinsic;
 def int_aarch64_sve_mla        : AdvSIMD_Pred3VectorArg_Intrinsic;
+def int_aarch64_sve_mla_lane   : AdvSIMD_3VectorArgIndexed_Intrinsic;
 def int_aarch64_sve_mls        : AdvSIMD_Pred3VectorArg_Intrinsic;
+def int_aarch64_sve_mls_lane   : AdvSIMD_3VectorArgIndexed_Intrinsic;
 
 def int_aarch64_sve_saddv      : AdvSIMD_SVE_SADDV_Reduce_Intrinsic;
 def int_aarch64_sve_uaddv      : AdvSIMD_SVE_SADDV_Reduce_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -215,6 +215,8 @@
   PTEST,
   PTRUE,
 
+  DUP_PRED,
+
   LDNF1,
   LDNF1S,
   LDFF1,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1425,6 +1425,7 @@
   case AArch64ISD::LDP:               return "AArch64ISD::LDP";
   case AArch64ISD::STP:               return "AArch64ISD::STP";
   case AArch64ISD::STNP:              return "AArch64ISD::STNP";
+  case AArch64ISD::DUP_PRED:          return "AArch64ISD::DUP_PRED";
   }
   return nullptr;
 }
@@ -10917,6 +10918,18 @@
   return SDValue();
 }
 
+static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
+  SDLoc dl(N);
+  SDValue Scalar = N->getOperand(3);
+  EVT ScalarTy = Scalar.getValueType();
+
+  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
+    Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
+
+  return DAG.getNode(AArch64ISD::DUP_PRED, dl, N->getValueType(0),
+                     N->getOperand(1), N->getOperand(2), Scalar);
+}
+
 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
   SDLoc dl(N);
   LLVMContext &Ctx = *DAG.getContext();
@@ -11105,6 +11118,8 @@
     return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG);
   case Intrinsic::aarch64_sve_andv:
     return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG);
+  case Intrinsic::aarch64_sve_dup:
+    return LowerSVEIntrinsicDUP(N, DAG);
   case Intrinsic::aarch64_sve_ext:
     return LowerSVEIntrinsicEXT(N, DAG);
   case Intrinsic::aarch64_sve_cmpeq_wide:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -96,6 +96,9 @@
 def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
 def AArch64ptest     : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
 
+def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisVec<2>, SDTCVecEltisVT<2,i1>]>;
+def AArch64dup_pred : SDNode<"AArch64ISD::DUP_PRED", SDT_AArch64DUP_PRED>;
+
 let Predicates = [HasSVE] in {
 
   defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
@@ -287,8 +290,8 @@
   defm DUP_ZZI  : sve_int_perm_dup_i<"dup">;
 
   // Splat scalar register (predicated)
-  defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy">;
-  defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">;
+  defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_pred>;
+  defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_pred>;
 
   // Select elements from either vector (predicated)
   defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;
@@ -1396,8 +1399,8 @@
 let Predicates = [HasSVE2] in {
 
   // SVE2 integer multiply-add (indexed)
-  defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", null_frag>;
-  defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", null_frag>;
+  defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>;
+  defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>;
 
   // SVE2 saturating multiply-add high (indexed)
   defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah_lane>;
@@ -1408,7 +1411,7 @@
   defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh>;
 
   // SVE2 integer multiply (indexed)
-  defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul", null_frag>;
+  defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul", int_aarch64_sve_mul_lane>;
 
   // SVE2 saturating multiply high (indexed)
   defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh", int_aarch64_sve_sqdmulh_lane>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5603,7 +5603,7 @@
   let ElementSize = zprty.ElementSize;
 }
 
-multiclass sve_int_perm_cpy_r<string asm> {
+multiclass sve_int_perm_cpy_r<string asm, SDPatternOperator op> {
   def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>;
   def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>;
   def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>;
@@ -5617,6 +5617,11 @@
                   (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
   def : InstAlias<"mov $Zd, $Pg/m, $Rn",
                   (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>;
+
+  def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, i32, !cast<Instruction>(NAME # _B)>;
+  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1,  i32, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1,  i32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1,  i64, !cast<Instruction>(NAME # _D)>;
 }
 
 class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -5640,7 +5645,7 @@
   let ElementSize = zprty.ElementSize;
 }
 
-multiclass sve_int_perm_cpy_v<string asm> {
+multiclass sve_int_perm_cpy_v<string asm, SDPatternOperator op> {
   def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>;
   def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>;
   def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>;
@@ -5654,6 +5659,11 @@
                   (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>;
   def : InstAlias<"mov $Zd, $Pg/m, $Vn",
                   (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>;
+
+  def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8i1, f16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4i1, f32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2f32, op, nxv2f32, nxv2i1, f32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2i1, f64, !cast<Instruction>(NAME # _D)>;
 }
 
 class sve_int_perm_compact
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
@@ -0,0 +1,83 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; DUP
+;
+
+define <vscale x 16 x i8> @dup_i8(<vscale x 16 x i8> %a, <vscale x 16 x i1> %pg, i8 %b) {
+; CHECK-LABEL: dup_i8:
+; CHECK: mov z0.b, p0/m, w0
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %a,
+                                                               <vscale x 16 x i1> %pg,
+                                                               i8 %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @dup_i16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg, i16 %b) {
+; CHECK-LABEL: dup_i16:
+; CHECK: mov z0.h, p0/m, w0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> %a,
+                                                               <vscale x 8 x i1> %pg,
+                                                               i16 %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @dup_i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg, i32 %b) {
+; CHECK-LABEL: dup_i32:
+; CHECK: mov z0.s, p0/m, w0
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %a,
+                                                               <vscale x 4 x i1> %pg,
+                                                               i32 %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dup_i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, i64 %b) {
+; CHECK-LABEL: dup_i64:
+; CHECK: mov z0.d, p0/m, x0
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> %a,
+                                                               <vscale x 2 x i1> %pg,
+                                                               i64 %b)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 8 x half> @dup_f16(<vscale x 8 x half> %a, <vscale x 8 x i1> %pg, half %b) {
+; CHECK-LABEL: dup_f16:
+; CHECK: mov z0.h, p0/m, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> %a,
+                                                                <vscale x 8 x i1> %pg,
+                                                                half %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @dup_f32(<vscale x 4 x float> %a, <vscale x 4 x i1> %pg, float %b) {
+; CHECK-LABEL: dup_f32:
+; CHECK: mov z0.s, p0/m, s1
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> %a,
+                                                                 <vscale x 4 x i1> %pg,
+                                                                 float %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @dup_f64(<vscale x 2 x double> %a, <vscale x 2 x i1> %pg, double %b) {
+; CHECK-LABEL: dup_f64:
+; CHECK: mov z0.d, p0/m, d1
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> %a,
+                                                                  <vscale x 2 x i1> %pg,
+                                                                  double %b)
+  ret <vscale x 2 x double> %out
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-mul-lane.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-mul-lane.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-mul-lane.ll
@@ -0,0 +1,119 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; MUL
+;
+
+define <vscale x 2 x i64> @mul_lane_d(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: mul_lane_d:
+; CHECK: mul z0.d, z0.d, z1.d[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.lane.nxv2i64(<vscale x 2 x i64> %a,
+                                                                    <vscale x 2 x i64> %b,
+                                                                    i32 1)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 4 x i32> @mul_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: mul_lane_s:
+; CHECK: mul z0.s, z0.s, z1.s[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.lane.nxv4i32(<vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b,
+                                                                    i32 1)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 8 x i16> @mul_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: mul_lane_h:
+; CHECK: mul z0.h, z0.h, z1.h[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.lane.nxv8i16(<vscale x 8 x i16> %a,
+                                                                    <vscale x 8 x i16> %b,
+                                                                    i32 1)
+  ret <vscale x 8 x i16> %out
+}
+
+;
+; MLA
+;
+
+define <vscale x 2 x i64> @mla_lane_d(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
+; CHECK-LABEL: mla_lane_d:
+; CHECK: mla z0.d, z1.d, z2.d[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.mla.lane.nxv2i64(<vscale x 2 x i64> %a,
+                                                                    <vscale x 2 x i64> %b,
+                                                                    <vscale x 2 x i64> %c,
+                                                                    i32 1)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 4 x i32> @mla_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
+; CHECK-LABEL: mla_lane_s:
+; CHECK: mla z0.s, z1.s, z2.s[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.mla.lane.nxv4i32(<vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b,
+                                                                    <vscale x 4 x i32> %c,
+                                                                    i32 1)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 8 x i16> @mla_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
+; CHECK-LABEL: mla_lane_h:
+; CHECK: mla z0.h, z1.h, z2.h[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.mla.lane.nxv8i16(<vscale x 8 x i16> %a,
+                                                                    <vscale x 8 x i16> %b,
+                                                                    <vscale x 8 x i16> %c,
+                                                                    i32 1)
+  ret <vscale x 8 x i16> %out
+}
+
+;
+; MLS
+;
+
+define <vscale x 2 x i64> @mls_lane_d(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
+; CHECK-LABEL: mls_lane_d:
+; CHECK: mls z0.d, z1.d, z2.d[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.mls.lane.nxv2i64(<vscale x 2 x i64> %a,
+                                                                    <vscale x 2 x i64> %b,
+                                                                    <vscale x 2 x i64> %c,
+                                                                    i32 1)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 4 x i32> @mls_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
+; CHECK-LABEL: mls_lane_s:
+; CHECK: mls z0.s, z1.s, z2.s[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.mls.lane.nxv4i32(<vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b,
+                                                                    <vscale x 4 x i32> %c,
+                                                                    i32 1)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 8 x i16> @mls_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
+; CHECK-LABEL: mls_lane_h:
+; CHECK: mls z0.h, z1.h, z2.h[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.mls.lane.nxv8i16(<vscale x 8 x i16> %a,
+                                                                    <vscale x 8 x i16> %b,
+                                                                    <vscale x 8 x i16> %c,
+                                                                    i32 1)
+  ret <vscale x 8 x i16> %out
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.lane.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.lane.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.lane.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mla.lane.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mla.lane.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mla.lane.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mls.lane.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mls.lane.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mls.lane.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, i32)
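For reference, a minimal C-side sketch of how these intrinsics are expected to be reached from source. It assumes the ACLE builtins svdup_n_s32_m, svmul_lane_s32, svmla_lane_s32 and svmls_lane_s32 from arm_sve.h, which are not part of this patch, and it assumes a front end maps them onto the llvm.aarch64.sve.dup and llvm.aarch64.sve.{mul,mla,mls}.lane intrinsics added above; the function names below are illustrative only.

// Sketch only: ACLE-level view of the operations modelled by the new
// intrinsics (assumed mapping, not established by this diff).
// Build with e.g. clang -O2 -march=armv8-a+sve2.
#include <arm_sve.h>

// Predicated splat: active lanes take x, inactive lanes keep 'inactive'.
svint32_t splat_merge(svint32_t inactive, svbool_t pg, int32_t x) {
  return svdup_n_s32_m(inactive, pg, x);   // cpy/mov z.s, p/m, w
}

// Multiply each element of a by element 1 of the matching 128-bit segment of b.
svint32_t mul_by_lane(svint32_t a, svint32_t b) {
  return svmul_lane_s32(a, b, 1);          // mul z.s, z.s, z.s[1]
}

// acc + a * b[1] and acc - a * b[1], again per 128-bit segment.
svint32_t mla_by_lane(svint32_t acc, svint32_t a, svint32_t b) {
  return svmla_lane_s32(acc, a, b, 1);     // mla z.s, z.s, z.s[1]
}

svint32_t mls_by_lane(svint32_t acc, svint32_t a, svint32_t b) {
  return svmls_lane_s32(acc, a, b, 1);     // mls z.s, z.s, z.s[1]
}

Note that llvm.aarch64.sve.dup takes its i8 and i16 scalars in a 32-bit register, which is why LowerSVEIntrinsicDUP any-extends those scalars to i32 before forming the DUP_PRED node.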