diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5860,24 +5860,33 @@
 defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
 defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
 
-// Patterns for uaddv(uaddlp(x)) ==> uaddlv
-def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef,
-            (v4i16 (AArch64uaddv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))),
-            (i64 0))), (i64 0))),
-          (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
-           (UADDLVv8i8v V64:$op), hsub), ssub)>;
-def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (AArch64uaddlp
-           (v16i8 V128:$op))))), (i64 0))),
-          (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
-           (UADDLVv16i8v V128:$op), hsub), ssub)>;
-def : Pat<(v4i32 (AArch64uaddv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
-          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (UADDLVv8i16v V128:$op), ssub)>;
-
-// Patterns for addp(uaddlp(x))) ==> uaddlv
-def : Pat<(v2i32 (AArch64uaddv (v2i32 (AArch64uaddlp (v4i16 V64:$op))))),
-          (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (UADDLVv4i16v V64:$op), ssub)>;
-def : Pat<(v2i64 (AArch64uaddv (v2i64 (AArch64uaddlp (v4i32 V128:$op))))),
-          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLVv4i32v V128:$op), dsub)>;
+// Selection patterns that replace an AArch64uaddv reduction of a pairwise
+// long add with the single instruction named Opc#<arrangement>.
+//   Opc   - instruction name prefix; it is concatenated with a suffix such as
+//           "v8i8v" and resolved with !cast<Instruction> (e.g. UADDLVv8i8v).
+//   addlp - the pairwise long add operator matched under the reduction.
+multiclass SIMDAcrossLaneLongPairIntrinsic<string Opc, SDPatternOperator addlp> {
+  // Patterns for addv(addlp(x)) ==> addlv
+  def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef,
+              (v4i16 (AArch64uaddv (v4i16 (addlp (v8i8 V64:$op))))),
+              (i64 0))), (i64 0))),
+            (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+              (!cast<Instruction>(Opc#"v8i8v") V64:$op), hsub), ssub)>;
+  def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (addlp (v16i8 V128:$op))))), (i64 0))),
+            (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+              (!cast<Instruction>(Opc#"v16i8v") V128:$op), hsub), ssub)>;
+  def : Pat<(v4i32 (AArch64uaddv (v4i32 (addlp (v8i16 V128:$op))))),
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (!cast<Instruction>(Opc#"v8i16v") V128:$op), ssub)>;
+
+  // Patterns for addp(addlp(x)) ==> addlv
+  def : Pat<(v2i32 (AArch64uaddv (v2i32 (addlp (v4i16 V64:$op))))),
+            (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (!cast<Instruction>(Opc#"v4i16v") V64:$op), ssub)>;
+  def : Pat<(v2i64 (AArch64uaddv (v2i64 (addlp (v4i32 V128:$op))))),
+            (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (!cast<Instruction>(Opc#"v4i32v") V128:$op), dsub)>;
+}
+
+defm : SIMDAcrossLaneLongPairIntrinsic<"UADDLV", AArch64uaddlp>;
+defm : SIMDAcrossLaneLongPairIntrinsic<"SADDLV", AArch64saddlp>;
 
 // Patterns for across-vector intrinsics, that have a node equivalent, that
 // returns a vector (with only the low lane defined) instead of a scalar.
diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
--- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -90,8 +90,7 @@
 ; CHECK-LABEL: saddlv4h_from_v8i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: saddlp v0.4h, v0.8b
-; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: saddlv h0, v0.8b
 ; CHECK-NEXT: fmov w0, s0
 ; CHECK-NEXT: ret
   %tmp1 = load <8 x i8>, <8 x i8>* %A
@@ -104,8 +103,7 @@
 ; CHECK-LABEL: saddlv16b_from_v16i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: saddlp v0.8h, v0.16b
-; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: saddlv h0, v0.16b
 ; CHECK-NEXT: fmov w0, s0
 ; CHECK-NEXT: ret
   %tmp1 = load <16 x i8>, <16 x i8>* %A
@@ -118,8 +116,7 @@
 ; CHECK-LABEL: saddlv8h_from_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: saddlp v0.4s, v0.8h
-; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: saddlv s0, v0.8h
 ; CHECK-NEXT: fmov w0, s0
 ; CHECK-NEXT: ret
   %tmp1 = load <8 x i16>, <8 x i16>* %A
@@ -132,8 +129,7 @@
 ; CHECK-LABEL: saddlv4s_from_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: saddlp v0.2d, v0.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: saddlv d0, v0.4s
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
   %tmp1 = load <4 x i32>, <4 x i32>* %A
@@ -146,8 +142,7 @@
 ; CHECK-LABEL: saddlv4h_from_v4i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: saddlp v0.2s, v0.4h
-; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: saddlv s0, v0.4h
 ; CHECK-NEXT: fmov w0, s0
 ; CHECK-NEXT: ret
   %tmp1 = load <4 x i16>, <4 x i16>* %A
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -28,8 +28,7 @@
 define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
 ; CHECK-LABEL: add_v4i32_v4i64_sext:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddlp v0.2d, v0.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: saddlv d0, v0.4s
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
 entry:
@@ -79,8 +78,7 @@
 define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
 ; CHECK-LABEL: add_v8i16_v8i32_sext:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddlp v0.4s, v0.8h
-; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: saddlv s0, v0.8h
 ; CHECK-NEXT: fmov w0, s0
 ; CHECK-NEXT: ret
 entry:
@@ -177,8 +175,7 @@
 ; CHECK-LABEL: add_v4i16_v4i64_sext:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: saddlp v0.2d, v0.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: saddlv d0, v0.4s
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
 entry:
@@ -295,8 +292,7 @@
 ; CHECK-BASE-LABEL: add_v8i8_v8i32_sext:
 ; CHECK-BASE: // %bb.0: // %entry
 ; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h
-; CHECK-BASE-NEXT: addv s0, v0.4s
+; CHECK-BASE-NEXT: saddlv s0, v0.8h
 ; CHECK-BASE-NEXT: fmov w0, s0
 ; CHECK-BASE-NEXT: ret
 ;
@@ -596,8 +592,7 @@
 define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
 ; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddlp v0.2d, v0.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: saddlv d0, v0.4s
 ; CHECK-NEXT: fmov x8, d0
 ; CHECK-NEXT: add x0, x8, x0
 ; CHECK-NEXT: ret
@@ -655,8 +650,7 @@
 define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
 ; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddlp v0.4s, v0.8h
-; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: saddlv s0, v0.8h
 ; CHECK-NEXT: fmov w8, s0
 ; CHECK-NEXT: add w0, w8, w0
 ; CHECK-NEXT: ret
@@ -768,8 +762,7 @@
 ; CHECK-LABEL: add_v4i16_v4i64_acc_sext:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: saddlp v0.2d, v0.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: saddlv d0, v0.4s
 ; CHECK-NEXT: fmov x8, d0
 ; CHECK-NEXT: add x0, x8, x0
 ; CHECK-NEXT: ret
@@ -901,8 +894,7 @@
 ; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_sext:
 ; CHECK-BASE: // %bb.0: // %entry
 ; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h
-; CHECK-BASE-NEXT: addv s0, v0.4s
+; CHECK-BASE-NEXT: saddlv s0, v0.8h
 ; CHECK-BASE-NEXT: fmov w8, s0
 ; CHECK-BASE-NEXT: add w0, w8, w0
 ; CHECK-BASE-NEXT: ret
@@ -974,8 +966,7 @@
 define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
 ; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddlp v0.8h, v0.16b
-; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: saddlv h0, v0.16b
 ; CHECK-NEXT: fmov w8, s0
 ; CHECK-NEXT: add w8, w8, w0
 ; CHECK-NEXT: sxth w0, w8
@@ -1695,10 +1686,8 @@
 define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: add_pair_v16i8_v16i16_sext:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddlp v0.8h, v0.16b
-; CHECK-NEXT: saddlp v1.8h, v1.16b
-; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: addv h1, v1.8h
+; CHECK-NEXT: saddlv h0, v0.16b
+; CHECK-NEXT: saddlv h1, v1.16b
 ; CHECK-NEXT: fmov w8, s0
 ; CHECK-NEXT: fmov w9, s1
 ; CHECK-NEXT: add w8, w8, w9