GCC generates fewer instructions than LLVM for the intrinsic example below.
#include <arm_neon.h>
uint8x16_t foo(uint8_t *a, uint8_t *b) {
    return vcombine_u8(vld1_dup_u8(a), vld1_dup_u8(b));
} 
gcc output
foo:
	ld1r	{v0.8b}, [x0]
	ld1r	{v1.8b}, [x1]
	ins	v0.d[1], v1.d[0]
	ret
llvm output
foo:                                    // @foo
        ldrb    w8, [x0]
        fmov    s0, w8
        mov     v0.b[1], w8
        mov     v0.b[2], w8
        mov     v0.b[3], w8
        mov     v0.b[4], w8
        mov     v0.b[5], w8
        mov     v0.b[6], w8
        mov     v0.b[7], w8
        ldrb    w8, [x1]
        mov     v0.b[8], w8
        mov     v0.b[9], w8
        mov     v0.b[10], w8
        mov     v0.b[11], w8
        mov     v0.b[12], w8
        mov     v0.b[13], w8
        mov     v0.b[14], w8
        mov     v0.b[15], w8
        ret

If a vector contains exactly two distinct values and can be split into two sub-vectors of the same length, each holding a single repeated value, generate a DUP for each sub-vector and combine them with CONCAT_VECTORS.
For example,
 t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
                           t24, t24, t24, t24, t24, t24, t24, t24
==>
   t26: v8i8 = AArch64ISD::DUP t23
   t28: v8i8 = AArch64ISD::DUP t24
 t29: v16i8 = concat_vectors t26, t28

With this patch, LLVM generates the output below.
foo:                                  // @foo
	ld1r	{ v1.8b }, [x1]
	ld1r	{ v0.8b }, [x0]
	mov	v0.d[1], v1.d[0]
	ret
The name "MaskVec" is a bit confusing in this context...
Maybe it would be easier to understand if you just construct it later, in the if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) codepath?