GCC generates fewer instructions than LLVM for the intrinsic example below.
#include <arm_neon.h>
uint8x16_t foo(uint8_t *a, uint8_t *b) {
return vcombine_u8(vld1_dup_u8(a), vld1_dup_u8(b));
}
gcc output
foo:
ld1r {v0.8b}, [x0]
ld1r {v1.8b}, [x1]
ins v0.d[1], v1.d[0]
ret
llvm output
foo: // @foo
ldrb w8, [x0]
fmov s0, w8
mov v0.b[1], w8
mov v0.b[2], w8
mov v0.b[3], w8
mov v0.b[4], w8
mov v0.b[5], w8
mov v0.b[6], w8
mov v0.b[7], w8
ldrb w8, [x1]
mov v0.b[8], w8
mov v0.b[9], w8
mov v0.b[10], w8
mov v0.b[11], w8
mov v0.b[12], w8
mov v0.b[13], w8
mov v0.b[14], w8
mov v0.b[15], w8
ret
If a vector has two different values and can be split into two sub-vectors of the same length, each holding a single repeated value, generate two DUP nodes and combine them with CONCAT_VECTORS.
For example,
t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
t24, t24, t24, t24, t24, t24, t24, t24
==>
t26: v8i8 = AArch64ISD::DUP t23
t28: v8i8 = AArch64ISD::DUP t24
t29: v16i8 = concat_vectors t26, t28
With this patch, LLVM generates the output below.
foo: // @foo
ld1r { v1.8b }, [x1]
ld1r { v0.8b }, [x0]
mov v0.d[1], v1.d[0]
ret
The name "MaskVec" is a bit confusing in this context...
Maybe it would be easier to understand if you just construct it later, in the if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) codepath?