GCC generates fewer instructions than LLVM for the intrinsic example below.
```c
#include <arm_neon.h>

void foo(int16x8_t a, int32x4_t acc, int32x4_t *out, const int32_t *p) {
  int16x8_t b = vcombine_s16(vmovn_s32(vld1q_s32(&p[0])),
                             vmovn_s32(vld1q_s32(&p[4])));
  acc = vmlsl_s16(acc, vget_low_s16(a), vget_low_s16(b));
  acc = vmlsl_s16(acc, vget_high_s16(a), vget_high_s16(b));
  *out = acc;
}
```
GCC output
```
foo:
        ldp     q2, q3, [x1]
        uzp1    v2.8h, v2.8h, v3.8h
        smlsl   v1.4s, v0.4h, v2.4h
        smlsl2  v1.4s, v0.8h, v2.8h
        str     q1, [x0]
        ret
```
LLVM output
```
        ldp     q2, q3, [x1]
        ext     v4.16b, v0.16b, v0.16b, #8
        xtn     v2.4h, v2.4s
        smlsl   v1.4s, v0.4h, v2.4h
        xtn     v0.4h, v3.4s
        smlsl   v1.4s, v4.4h, v0.4h
        str     q1, [x0]
        ret
```
It looks like GCC keeps the intrinsic calls as builtin function calls through the middle end.
For example, the vmovn and vcombine intrinsic calls are matched to the uzp1 pattern as shown below.
```
_4 = __builtin_aarch64_xtnv4si (_3);     (insn 9 8 10 (set (reg:V4SI 107)
_6 = __builtin_aarch64_xtnv4si (_5);     (insn 12 11 13 (set (reg:V4SI 109)
_7 = {_4, _6};
...
(insn 10 9 11 (set (reg:V8HI 108)
        (vec_concat:V8HI (truncate:V4HI (reg:V4SI 107))
            (const_vector:V4HI [
                    (const_int 0 [0]) repeated x4
                ]))) (nil))
(insn 11 10 0 (set (reg:V4HI 93 [ _5 ])
        (subreg:V4HI (reg:V8HI 108) 0)) (nil))
(insn 13 12 14 (set (reg:V8HI 110)
        (vec_concat:V8HI (truncate:V4HI (reg:V4SI 109))
            (const_vector:V4HI [
                    (const_int 0 [0]) repeated x4
                ]))) (nil))
(insn 14 13 0 (set (reg:V4HI 95 [ _7 ])
        (subreg:V4HI (reg:V8HI 110) 0)) (nil))
(insn 15 14 16 (set (reg:V8HI 111)
        (vec_concat:V8HI (reg:V4HI 93 [ _5 ])
            (reg:V4HI 95 [ _7 ]))) (nil))
...
(define_insn "*aarch64_narrow_trunc<mode>"
  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
        (vec_concat:<VNARROWQ2>
          (truncate:<VNARROWQ>
            (match_operand:VQN 1 "register_operand" "w"))
          (truncate:<VNARROWQ>
            (match_operand:VQN 2 "register_operand" "w"))))]
  "TARGET_SIMD"
{
  if (!BYTES_BIG_ENDIAN)
    return "uzp1\\t%0.<V2ntype>, %1.<V2ntype>, %2.<V2ntype>";
  else
    return "uzp1\\t%0.<V2ntype>, %2.<V2ntype>, %1.<V2ntype>";
}
  [(set_attr "type" "neon_permute<q>")]
)
```
Clang, by contrast, generates IR definitions for some of the intrinsic functions. After inlining, some of the intrinsic calls are optimized away, as shown below.
```llvm
define dso_local void @foo(<8 x i16> noundef %a, <4 x i32> noundef %acc, ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly %p) local_unnamed_addr #0 {
entry:
  %0 = load <4 x i32>, ptr %p, align 4
  %vmovn.i = trunc <4 x i32> %0 to <4 x i16>
  %arrayidx2 = getelementptr inbounds i32, ptr %p, i64 4
  %1 = load <4 x i32>, ptr %arrayidx2, align 4
  %vmovn.i17 = trunc <4 x i32> %1 to <4 x i16>
  %shuffle.i18 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i18, <4 x i16> %vmovn.i)
  %shuffle.i19 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i20 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i19, <4 x i16> %vmovn.i17)
  %2 = add <4 x i32> %vmull2.i.i, %vmull2.i.i20
  %sub.i21 = sub <4 x i32> %acc, %2
  store <4 x i32> %sub.i21, ptr %out, align 16, !tbaa !6
  ret void
}
```
For the uzp1 instruction, it is hard to match the existing uzp1 patterns without the concat_vectors node that would come from vcombine_s16: after inlining and optimization, the truncated values feed the smull calls directly, and no concat remains in the IR.
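For comparison, here is a minimal hand-written IR sketch (not taken from the clang output above; the function name is made up) of the shape that does map onto uzp1: two truncs combined by a single shufflevector, which becomes a concat_vectors during selection and which I would expect the AArch64 backend to select as one uzp1.

```llvm
; Hypothetical example: concatenating two narrowing truncs is the shape that
; should select to a single "uzp1 v0.8h, v0.8h, v1.8h" on AArch64.
define <8 x i16> @narrow_pair(<4 x i32> %lo, <4 x i32> %hi) {
  %t0 = trunc <4 x i32> %lo to <4 x i16>
  %t1 = trunc <4 x i32> %hi to <4 x i16>
  %r = shufflevector <4 x i16> %t0, <4 x i16> %t1,
                     <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %r
}
```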
If clang did not generate definitions for these intrinsics and instead left the calls for the backend to lower, we could get code similar to GCC's. However, I do not think that is a good approach; it is better to keep generating the intrinsic definitions and to optimize the code after inlining.
Alternatively, I have tried matching the MIR sequence around smlsl in the AArch64MIPeepholeOpt pass. With this patch, the LLVM output is as below.
```
foo:
        ldp     q2, q3, [x1]
        uzp1    v2.8h, v2.8h, v3.8h
        smlsl   v1.4s, v0.4h, v2.4h
        smlsl2  v1.4s, v0.8h, v2.8h
        str     q1, [x0]
        ret
```
This (and maybe the distance checks below) would make the algorithm O(N^2) in the number of instructions in the block.
It does allow the algorithm to be quite general - it can match any truncate, with the UZP getting a free truncate for what may be an unrelated instruction. It may not always be valid though - could the truncate depend on the result of the UZP, or vice versa? It does have the advantage that it works with either SDAG or GlobalISel, though.
From what I have seen, mulls often come in pairs. For example, the code in smlsl_smlsl2_v4i32_uzp1 contains such a pair.
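A rough sketch of such a pair, modelled on the inlined IR above rather than copied from the test (all names here are made up): each smull takes one half of %a and a separately truncated operand, so each trunc currently selects to its own xtn.

```llvm
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)

; Hypothetical sketch: a pair of smulls, one on the low half of %a and one on
; the high half, each multiplying a separately truncated vector and feeding
; the accumulator through chained subs (the smlsl/smlsl2 shape).
define <4 x i32> @paired_smull(<8 x i16> %a, <4 x i32> %x, <4 x i32> %y, <4 x i32> %acc) {
  %t0  = trunc <4 x i32> %x to <4 x i16>
  %t1  = trunc <4 x i32> %y to <4 x i16>
  %alo = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ahi = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %m0  = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %alo, <4 x i16> %t0)
  %m1  = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %ahi, <4 x i16> %t1)
  %s0  = sub <4 x i32> %acc, %m0
  %s1  = sub <4 x i32> %s0, %m1
  ret <4 x i32> %s1
}
```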
If it was processing the smlsl2, it might be able to look at the extract-high of the first operand, see that it has two uses with the other being an smull(extract-low(.)), and use the other operand of that smull in the UZP instead of the undef when creating it in the DAG? It has to check a number of things (and does not help with GlobalISel), but hopefully it fits in as an extension to the existing code in SDAG.
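Expressed at the IR level rather than as a DAG combine, a hypothetical sketch of the shape that suggestion would effectively aim for (again, all names made up): both smulls read halves of one concatenated vector, so a single uzp1 feeds both the smlsl and the smlsl2.

```llvm
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)

; Hypothetical target shape: one shuffle concatenates both truncated halves
; (one uzp1), and the two smulls read its low and high halves, so the second
; multiply can use the full registers and select smlsl2.
define <4 x i32> @paired_smull_uzp1(<8 x i16> %a, <4 x i32> %x, <4 x i32> %y, <4 x i32> %acc) {
  %t0  = trunc <4 x i32> %x to <4 x i16>
  %t1  = trunc <4 x i32> %y to <4 x i16>
  %b   = shufflevector <4 x i16> %t0, <4 x i16> %t1,
                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %alo = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ahi = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %blo = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %bhi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %m0  = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %alo, <4 x i16> %blo)
  %m1  = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %ahi, <4 x i16> %bhi)
  %s0  = sub <4 x i32> %acc, %m0
  %s1  = sub <4 x i32> %s0, %m1
  ret <4 x i32> %s1
}
```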