GCC generates fewer instructions than LLVM for the intrinsic example below. The example was mentioned in https://reviews.llvm.org/D148134.
#include <arm_neon.h>

uint8x8_t test1(uint8x8_t a) {
  return vdup_n_u8(vrshrd_n_u64(vaddlv_u8(a), 3));
}

gcc output:

test1:
        uaddlv  h0, v0.8b
        umov    w0, v0.h[0]
        fmov    d0, x0
        urshr   d0, d0, 3
        dup     v0.8b, v0.b[0]
        ret

llvm output:

test1:                                  // @test1
        uaddlv  h0, v0.8b
        fmov    w8, s0
        and     w8, w8, #0xffff
        fmov    d0, x8
        urshr   d0, d0, #3
        fmov    x8, d0
        dup     v0.8b, w8
        ret
With this patch's TableGen pattern, LLVM generates the output below.
test1:                                  // @test1
        uaddlv  h0, v0.8b
        urshr   d0, d0, #3
        fmov    x8, d0
        dup     v0.8b, w8
        ret