GCC generates fewer instructions than LLVM for the intrinsic example below.
#include <arm_neon.h>
void foo(int16x8_t a, int32x4_t acc, int32x4_t *out, const int32_t *p) {
int16x8_t b = vcombine_s16(vmovn_s32(vld1q_s32(&p[0])),
vmovn_s32(vld1q_s32(&p[4])));
acc = vmlsl_s16(acc, vget_low_s16(a), vget_low_s16(b));
acc = vmlsl_s16(acc, vget_high_s16(a), vget_high_s16(b));
*out = acc;
}

GCC output:
foo:
ldp q2, q3, [x1]
uzp1 v2.8h, v2.8h, v3.8h
smlsl v1.4s, v0.4h, v2.4h
smlsl2 v1.4s, v0.8h, v2.8h
str q1, [x0]
ret

LLVM output:
ldp q2, q3, [x1]
ext v4.16b, v0.16b, v0.16b, #8
xtn v2.4h, v2.4s
smlsl v1.4s, v0.4h, v2.4h
xtn v0.4h, v3.4s
smlsl v1.4s, v4.4h, v0.4h
str q1, [x0]
ret
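For reference, the same computation can also be written with an explicit uzp1 at the source level, in which case both compilers should emit UZP1 directly for the narrowing step. This is only an illustrative variant, not part of the original report: the name foo_uzp1 is made up, and it relies on little-endian lane order, where the low 16 bits of each 32-bit lane are the even-numbered 16-bit lanes.

#include <arm_neon.h>

/* Illustrative little-endian-only variant: the two vmovn_s32 plus the
   vcombine_s16 are expressed as one vuzp1q_s16 over the reinterpreted
   32-bit input vectors. */
void foo_uzp1(int16x8_t a, int32x4_t acc, int32x4_t *out, const int32_t *p) {
    int16x8_t b = vuzp1q_s16(vreinterpretq_s16_s32(vld1q_s32(&p[0])),
                             vreinterpretq_s16_s32(vld1q_s32(&p[4])));
    acc = vmlsl_s16(acc, vget_low_s16(a), vget_low_s16(b));
    acc = vmlsl_s16(acc, vget_high_s16(a), vget_high_s16(b));
    *out = acc;
}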
It looks like GCC keeps the intrinsic function calls as builtin function calls. For example, the vmovn and vcombine intrinsic calls are matched to the uzp1 pattern, as below.
_4 = __builtin_aarch64_xtnv4si (_3);
_6 = __builtin_aarch64_xtnv4si (_5);
_7 = {_4, _6};
...
(insn 9 8 10 (set (reg:V4SI 107)
        ...
(insn 10 9 11 (set (reg:V8HI 108)
        (vec_concat:V8HI (truncate:V4HI (reg:V4SI 107))
            (const_vector:V4HI [
                (const_int 0 [0]) repeated x4
            ])))
     (nil))
(insn 11 10 0 (set (reg:V4HI 93 [ _5 ])
        (subreg:V4HI (reg:V8HI 108) 0))
     (nil))
(insn 12 11 13 (set (reg:V4SI 109)
        ...
(insn 13 12 14 (set (reg:V8HI 110)
        (vec_concat:V8HI (truncate:V4HI (reg:V4SI 109))
            (const_vector:V4HI [
                (const_int 0 [0]) repeated x4
            ])))
     (nil))
(insn 14 13 0 (set (reg:V4HI 95 [ _7 ])
        (subreg:V4HI (reg:V8HI 110) 0))
     (nil))
(insn 15 14 16 (set (reg:V8HI 111)
        (vec_concat:V8HI (reg:V4HI 93 [ _5 ])
            (reg:V4HI 95 [ _7 ])))
     (nil))
...
(define_insn "*aarch64_narrow_trunc<mode>"
[(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
(vec_concat:<VNARROWQ2>
(truncate:<VNARROWQ>
(match_operand:VQN 1 "register_operand" "w"))
(truncate:<VNARROWQ>
(match_operand:VQN 2 "register_operand" "w"))))]
"TARGET_SIMD"
{
if (!BYTES_BIG_ENDIAN)
return "uzp1\\t%0.<V2ntype>, %1.<V2ntype>, %2.<V2ntype>";
else
return "uzp1\\t%0.<V2ntype>, %2.<V2ntype>, %1.<V2ntype>";
}
[(set_attr "type" "neon_permute<q>")]
)

It looks like Clang generates definitions for some of the intrinsic functions, and after inlining some of the intrinsic calls are optimized away, as below.
define dso_local void @foo(<8 x i16> noundef %a, <4 x i32> noundef %acc, ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly %p) local_unnamed_addr #0 {
entry:
%0 = load <4 x i32>, ptr %p, align 4
%vmovn.i = trunc <4 x i32> %0 to <4 x i16>
%arrayidx2 = getelementptr inbounds i32, ptr %p, i64 4
%1 = load <4 x i32>, ptr %arrayidx2, align 4
%vmovn.i17 = trunc <4 x i32> %1 to <4 x i16>
%shuffle.i18 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i18, <4 x i16> %vmovn.i)
%shuffle.i19 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vmull2.i.i20 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i19, <4 x i16> %vmovn.i17)
%2 = add <4 x i32> %vmull2.i.i, %vmull2.i.i20
%sub.i21 = sub <4 x i32> %acc, %2
store <4 x i32> %sub.i21, ptr %out, align 16, !tbaa !6
ret void
}

For the uzp1 instruction, it is hard to match the existing uzp1 pattern once the concat_vectors coming from vcombine_s16 has been optimized away.
If Clang did not generate the intrinsic functions' definitions and instead let the backend lower the intrinsic calls, we could get code similar to GCC's. However, I do not think that is a good approach; it is better to generate the intrinsic functions' definitions and optimize the code after inlining.
Alternatively, I have tried detecting the MIR code sequence around smlsl in the AArch64MIPeepholeOpt pass. With this patch, the LLVM output is as below.
foo:
ldp q2, q3, [x1]
uzp1 v2.8h, v2.8h, v3.8h
smlsl v1.4s, v0.4h, v2.4h
smlsl2 v1.4s, v0.8h, v2.8h
str q1, [x0]
ret
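As a sanity check on the rewritten sequence, a small harness along these lines could compare foo against a scalar reference. This is only a sketch and not part of the patch: foo_ref, the test values, and the driver are made up, and it assumes an AArch64 target where arm_neon.h is available.

#include <arm_neon.h>
#include <stdio.h>

void foo(int16x8_t a, int32x4_t acc, int32x4_t *out, const int32_t *p);

/* Scalar reference: out[i] = acc[i] - a[i]*trunc16(p[i]) - a[i+4]*trunc16(p[i+4]) */
static void foo_ref(const int16_t a[8], const int32_t acc[4], int32_t out[4],
                    const int32_t *p) {
    for (int i = 0; i < 4; ++i)
        out[i] = acc[i] - a[i] * (int16_t)p[i] - a[i + 4] * (int16_t)p[i + 4];
}

int main(void) {
    int16_t a[8] = {1, -2, 3, -4, 5, -6, 7, -8};
    int32_t acc[4] = {100, 200, 300, 400};
    int32_t p[8] = {0x00010002, -3, 70000, 5, -6, 0x00070008, 9, -10};
    int32x4_t out_v;
    int32_t got[4], want[4];

    foo(vld1q_s16(a), vld1q_s32(acc), &out_v, p);
    vst1q_s32(got, out_v);
    foo_ref(a, acc, want, p);

    for (int i = 0; i < 4; ++i)
        printf("lane %d: got %d, want %d\n", i, got[i], want[i]);
    return 0;
}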
Perhaps name this tryCombineMULLWithUZP1, to show that it operates on MULL nodes.