GCC generates fewer instructions than LLVM for the intrinsic example below.
#include <arm_neon.h>
void foo(int16x8_t a, int32x4_t acc, int32x4_t *out, const int32_t *p) {
int16x8_t b = vcombine_s16(vmovn_s32(vld1q_s32(&p[0])),
vmovn_s32(vld1q_s32(&p[4])));
acc = vmlsl_s16(acc, vget_low_s16(a), vget_low_s16(b));
acc = vmlsl_s16(acc, vget_high_s16(a), vget_high_s16(b));
*out = acc;
}

GCC output:
foo:
ldp q2, q3, [x1]
uzp1 v2.8h, v2.8h, v3.8h
smlsl v1.4s, v0.4h, v2.4h
smlsl2 v1.4s, v0.8h, v2.8h
str q1, [x0]
ret

LLVM output:
ldp q2, q3, [x1]
ext v4.16b, v0.16b, v0.16b, #8
xtn v2.4h, v2.4s
smlsl v1.4s, v0.4h, v2.4h
xtn v0.4h, v3.4s
smlsl v1.4s, v4.4h, v0.4h
str q1, [x0]
ret
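For reference, the same computation can also be written with an explicit uzp1 at the source level, in which case both compilers should emit UZP1 directly for the narrowing step. This is only an illustrative variant, not part of the original report: the name foo_uzp1 is made up, and it relies on little-endian lane order, where the low 16 bits of each 32-bit lane are the even-numbered 16-bit lanes.

#include <arm_neon.h>

/* Illustrative little-endian-only variant: the two vmovn_s32 plus the
   vcombine_s16 are expressed as one vuzp1q_s16 over the reinterpreted
   32-bit input vectors. */
void foo_uzp1(int16x8_t a, int32x4_t acc, int32x4_t *out, const int32_t *p) {
    int16x8_t b = vuzp1q_s16(vreinterpretq_s16_s32(vld1q_s32(&p[0])),
                             vreinterpretq_s16_s32(vld1q_s32(&p[4])));
    acc = vmlsl_s16(acc, vget_low_s16(a), vget_low_s16(b));
    acc = vmlsl_s16(acc, vget_high_s16(a), vget_high_s16(b));
    *out = acc;
}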
It looks like GCC keeps the intrinsic function calls as builtin function calls. For example, the vmovn and vcombine intrinsic calls are matched to the uzp1 pattern, as below.
_4 = __builtin_aarch64_xtnv4si (_3);
_6 = __builtin_aarch64_xtnv4si (_5);
_7 = {_4, _6};
...
(insn 9 8 10 (set (reg:V4SI 107)
        ...
(insn 10 9 11 (set (reg:V8HI 108)
        (vec_concat:V8HI (truncate:V4HI (reg:V4SI 107))
            (const_vector:V4HI [
                (const_int 0 [0]) repeated x4
            ])))
     (nil))
(insn 11 10 0 (set (reg:V4HI 93 [ _5 ])
        (subreg:V4HI (reg:V8HI 108) 0))
     (nil))
(insn 12 11 13 (set (reg:V4SI 109)
        ...
(insn 13 12 14 (set (reg:V8HI 110)
        (vec_concat:V8HI (truncate:V4HI (reg:V4SI 109))
            (const_vector:V4HI [
                (const_int 0 [0]) repeated x4
            ])))
     (nil))
(insn 14 13 0 (set (reg:V4HI 95 [ _7 ])
        (subreg:V4HI (reg:V8HI 110) 0))
     (nil))
(insn 15 14 16 (set (reg:V8HI 111)
        (vec_concat:V8HI (reg:V4HI 93 [ _5 ])
            (reg:V4HI 95 [ _7 ])))
     (nil))
...
(define_insn "*aarch64_narrow_trunc<mode>"
[(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
(vec_concat:<VNARROWQ2>
(truncate:<VNARROWQ>
(match_operand:VQN 1 "register_operand" "w"))
(truncate:<VNARROWQ>
(match_operand:VQN 2 "register_operand" "w"))))]
"TARGET_SIMD"
{
if (!BYTES_BIG_ENDIAN)
return "uzp1\\t%0.<V2ntype>, %1.<V2ntype>, %2.<V2ntype>";
else
return "uzp1\\t%0.<V2ntype>, %2.<V2ntype>, %1.<V2ntype>";
}
[(set_attr "type" "neon_permute<q>")]
)

It looks like Clang generates definitions for some of the intrinsic functions, and after inlining some of the intrinsic calls are optimized away, as below.
define dso_local void @foo(<8 x i16> noundef %a, <4 x i32> noundef %acc, ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly %p) local_unnamed_addr #0 {
entry:
%0 = load <4 x i32>, ptr %p, align 4
%vmovn.i = trunc <4 x i32> %0 to <4 x i16>
%arrayidx2 = getelementptr inbounds i32, ptr %p, i64 4
%1 = load <4 x i32>, ptr %arrayidx2, align 4
%vmovn.i17 = trunc <4 x i32> %1 to <4 x i16>
%shuffle.i18 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i18, <4 x i16> %vmovn.i)
%shuffle.i19 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vmull2.i.i20 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i19, <4 x i16> %vmovn.i17)
%2 = add <4 x i32> %vmull2.i.i, %vmull2.i.i20
%sub.i21 = sub <4 x i32> %acc, %2
store <4 x i32> %sub.i21, ptr %out, align 16, !tbaa !6
ret void
}

For the uzp1 instruction, it is hard to match the existing uzp1 pattern once the concat_vectors coming from vcombine_s16 has been optimized away.
If Clang did not generate the intrinsic functions' definitions and instead let the backend lower the intrinsic calls, we could get code similar to GCC's. However, I do not think that is a good approach; it is better to generate the intrinsic functions' definitions and optimize the code after inlining.
Alternatively, I have tried detecting the MIR code sequence around smlsl in the AArch64MIPeepholeOpt pass. With this patch, the LLVM output is as below.
foo:
ldp q2, q3, [x1]
uzp1 v2.8h, v2.8h, v3.8h
smlsl v1.4s, v0.4h, v2.4h
smlsl2 v1.4s, v0.8h, v2.8h
str q1, [x0]
ret
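As a sanity check on the rewritten sequence, a small harness along these lines could compare foo against a scalar reference. This is only a sketch and not part of the patch: foo_ref, the test values, and the driver are made up, and it assumes an AArch64 target where arm_neon.h is available.

#include <arm_neon.h>
#include <stdio.h>

void foo(int16x8_t a, int32x4_t acc, int32x4_t *out, const int32_t *p);

/* Scalar reference: out[i] = acc[i] - a[i]*trunc16(p[i]) - a[i+4]*trunc16(p[i+4]) */
static void foo_ref(const int16_t a[8], const int32_t acc[4], int32_t out[4],
                    const int32_t *p) {
    for (int i = 0; i < 4; ++i)
        out[i] = acc[i] - a[i] * (int16_t)p[i] - a[i + 4] * (int16_t)p[i + 4];
}

int main(void) {
    int16_t a[8] = {1, -2, 3, -4, 5, -6, 7, -8};
    int32_t acc[4] = {100, 200, 300, 400};
    int32_t p[8] = {0x00010002, -3, 70000, 5, -6, 0x00070008, 9, -10};
    int32x4_t out_v;
    int32_t got[4], want[4];

    foo(vld1q_s16(a), vld1q_s32(acc), &out_v, p);
    vst1q_s32(got, out_v);
    foo_ref(a, acc, want, p);

    for (int i = 0; i < 4; ++i)
        printf("lane %d: got %d, want %d\n", i, got[i], want[i]);
    return 0;
}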
Perhaps name this tryCombineMULLWithUZP1, to show that it operates on MULL nodes.