GCC generates fewer instructions than LLVM for the intrinsic example below. The example was mentioned in https://reviews.llvm.org/D148134.
#include <arm_neon.h>
uint8x8_t test1(uint8x8_t a) {
return vdup_n_u8(vrshrd_n_u64(vaddlv_u8(a), 3));
}
gcc output
test1:
uaddlv h0, v0.8b
umov w0, v0.h[0]
fmov d0, x0
urshr d0, d0, 3
dup v0.8b, v0.b[0]
ret
llvm output
test1: // @test1
uaddlv h0, v0.8b
fmov w8, s0
and w8, w8, #0xffff
fmov d0, x8
urshr d0, d0, #3
fmov x8, d0
dup v0.8b, w8
ret
With this patch's TableGen pattern, LLVM generates the output below.
test1: // @test1
uaddlv h0, v0.8b
urshr d0, d0, #3
fmov x8, d0
dup v0.8b, w8
ret