Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4638,8 +4638,10 @@
 }
 
 // Use the more efficient MOVI instead of DUP from ZR to zero up vectors
-def : Pat<(v2f32 (AArch64dup (f32 fpimm0))), (MOVIv2i32 (i32 0), (i32 0))>;
+def : Pat<(v1f64 (AArch64dup (f64 fpimm0))), (MOVID (i32 0))>;
+def : Pat<(v2f32 (AArch64dup (f32 fpimm0))), (MOVIv2i32 (i32 0), (i32 0))>;
+def : Pat<(v1i64 (AArch64dup (i32 0))), (MOVID (i32 0))>;
 def : Pat<(v2i32 (AArch64dup (i32 0))), (MOVIv2i32 (i32 0), (i32 0))>;
 def : Pat<(v4i16 (AArch64dup (i32 0))), (MOVIv4i16 (i32 0), (i32 0))>;
 def : Pat<(v8i8 (AArch64dup (i32 0))), (MOVIv8b_ns (i32 0))>;
 
@@ -4652,6 +4654,17 @@
 def : Pat<(v8i16 (AArch64dup (i32 0))), (MOVIv8i16 (i32 0), (i32 0))>;
 def : Pat<(v16i8 (AArch64dup (i32 0))), (MOVIv16b_ns (i32 0))>;
 
+// Use the more efficient MOVI instead of DUP from register to set vector masks
+def : Pat<(v1i64 (AArch64dup (i64 -1))), (MOVID (i32 -1))>;
+def : Pat<(v2i32 (AArch64dup (i32 -1))), (MOVIv2i32 (i32 -1), (i32 0))>;
+def : Pat<(v4i16 (AArch64dup (i32 0xffff))), (MOVIv4i16 (i32 -1), (i32 0))>;
+def : Pat<(v8i8 (AArch64dup (i32 0xff))), (MOVIv8b_ns (i32 -1))>;
+
+def : Pat<(v2i64 (AArch64dup (i64 -1))), (MOVIv2d_ns (i32 -1))>;
+def : Pat<(v4i32 (AArch64dup (i32 -1))), (MOVIv4i32 (i32 -1), (i32 0))>;
+def : Pat<(v8i16 (AArch64dup (i32 0xffff))), (MOVIv8i16 (i32 -1), (i32 0))>;
+def : Pat<(v16i8 (AArch64dup (i32 0xff))), (MOVIv16b_ns (i32 -1))>;
+
 // AdvSIMD MVNI
 
 // EDIT per word & halfword: 2s, 4h, 4s, & 8h
Index: llvm/test/CodeGen/AArch64/build-one-lane.ll
===================================================================
--- llvm/test/CodeGen/AArch64/build-one-lane.ll
+++ llvm/test/CodeGen/AArch64/build-one-lane.ll
@@ -3,7 +3,7 @@
 ; Check that building up a vector w/ only one non-zero lane initializes
 ; intelligently.
 
-define <8 x i8> @v8i8(i8 %t, i8 %s) nounwind {
+define <8 x i8> @v8i8z(i8 %t, i8 %s) nounwind {
   %v = insertelement <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i8 %s, i32 7
   ret <8 x i8> %v
 
@@ -11,7 +11,7 @@
 ; CHECK: mov v[[R]].b[7], w{{[0-9]+}}
 }
 
-define <16 x i8> @v16i8(i8 %t, i8 %s) nounwind {
+define <16 x i8> @v16i8z(i8 %t, i8 %s) nounwind {
   %v = insertelement <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i8 %s, i32 15
   ret <16 x i8> %v
 
@@ -19,7 +19,7 @@
 ; CHECK: mov v[[R]].b[15], w{{[0-9]+}}
 }
 
-define <4 x i16> @v4i16(i16 %t, i16 %s) nounwind {
+define <4 x i16> @v4i16z(i16 %t, i16 %s) nounwind {
   %v = insertelement <4 x i16> <i16 0, i16 0, i16 0, i16 0>, i16 %s, i32 3
   ret <4 x i16> %v
 
@@ -27,7 +27,7 @@
 ; CHECK: mov v[[R]].h[3], w{{[0-9]+}}
 }
 
-define <8 x i16> @v8i16(i16 %t, i16 %s) nounwind {
+define <8 x i16> @v8i16z(i16 %t, i16 %s) nounwind {
   %v = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %s, i32 7
   ret <8 x i16> %v
 
@@ -35,7 +35,7 @@
 ; CHECK: mov v[[R]].h[7], w{{[0-9]+}}
 }
 
-define <2 x i32> @v2i32(i32 %t, i32 %s) nounwind {
+define <2 x i32> @v2i32z(i32 %t, i32 %s) nounwind {
   %v = insertelement <2 x i32> <i32 0, i32 0>, i32 %s, i32 1
   ret <2 x i32> %v
 
@@ -43,7 +43,7 @@
 ; CHECK: mov v[[R]].s[1], w{{[0-9]+}}
 }
 
-define <4 x i32> @v4i32(i32 %t, i32 %s) nounwind {
+define <4 x i32> @v4i32z(i32 %t, i32 %s) nounwind {
   %v = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %s, i32 3
   ret <4 x i32> %v
 
@@ -51,7 +51,7 @@
 ; CHECK: mov v[[R]].s[3], w{{[0-9]+}}
 }
 
-define <2 x i64> @v2i64(i64 %t, i64 %s) nounwind {
+define <2 x i64> @v2i64z(i64 %t, i64 %s) nounwind {
   %v = insertelement <2 x i64> <i64 0, i64 0>, i64 %s, i32 1
   ret <2 x i64> %v
 
@@ -59,7 +59,7 @@
 ; CHECK: mov v[[R]].d[1], x{{[0-9]+}}
 }
 
-define <2 x float> @v2f32(float %t, float %s) nounwind {
+define <2 x float> @v2f32z(float %t, float %s) nounwind {
   %v = insertelement <2 x float> <float 0.0, float 0.0>, float %s, i32 1
   ret <2 x float> %v
 
@@ -67,7 +67,7 @@
 ; CHECK: mov v[[R]].s[1], v{{[0-9]+}}.s[0]
 }
 
-define <4 x float> @v4f32(float %t, float %s) nounwind {
+define <4 x float> @v4f32z(float %t, float %s) nounwind {
   %v = insertelement <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, float %s, i32 3
   ret <4 x float> %v
 
@@ -75,10 +75,83 @@
 ; CHECK: mov v[[R]].s[3], v{{[0-9]+}}.s[0]
 }
 
-define <2 x double> @v2f64(double %t, double %s) nounwind {
+define <1 x double> @v1f64z(double %t, double %s) nounwind {
+  %v = insertelement <1 x double> <double undef>, double 0.0, i32 0
+  ret <1 x double> %v
+
+; CHECK: movi d{{[0-9]+}}, #0
+}
+
+define <2 x double> @v2f64z(double %t, double %s) nounwind {
   %v = insertelement <2 x double> <double 0.0, double 0.0>, double %s, i32 1
   ret <2 x double> %v
 
 ; CHECK: movi v[[R:[0-9]+]].2d, #0
 ; CHECK: mov v[[R]].d[1], v{{[0-9]+}}.d[0]
 }
+
+; Check that building up a vector w/ only one non-all-ones lane initializes
+; intelligently.
+
+define <8 x i8> @v8i8m(i8 %t, i8 %s) nounwind {
+  %v = insertelement <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 %s, i32 7
+  ret <8 x i8> %v
+
+; CHECK: movi v[[R:[0-9]+]].8b, #-1
+; CHECK: mov v[[R]].b[7], w{{[0-9]+}}
+}
+
+define <16 x i8> @v16i8m(i8 %t, i8 %s) nounwind {
+  %v = insertelement <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 %s, i32 15
+  ret <16 x i8> %v
+
+; CHECK: movi v[[R:[0-9]+]].16b, #-1
+; CHECK: mov v[[R]].b[15], w{{[0-9]+}}
+}
+
+define <4 x i16> @v4i16m(i16 %t, i16 %s) nounwind {
+  %v = insertelement <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, i16 %s, i32 3
+  ret <4 x i16> %v
+
+; CHECK: movi v[[R:[0-9]+]].4h, #-1
+; CHECK: mov v[[R]].h[3], w{{[0-9]+}}
+}
+
+define <8 x i16> @v8i16m(i16 %t, i16 %s) nounwind {
+  %v = insertelement <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, i16 %s, i32 7
+  ret <8 x i16> %v
+
+; CHECK: movi v[[R:[0-9]+]].8h, #-1
+; CHECK: mov v[[R]].h[7], w{{[0-9]+}}
+}
+
+define <2 x i32> @v2i32m(i32 %t, i32 %s) nounwind {
+  %v = insertelement <2 x i32> <i32 -1, i32 -1>, i32 %s, i32 1
+  ret <2 x i32> %v
+
+; CHECK: movi v[[R:[0-9]+]].2s, #-1
+; CHECK: mov v[[R]].s[1], w{{[0-9]+}}
+}
+
+define <4 x i32> @v4i32m(i32 %t, i32 %s) nounwind {
+  %v = insertelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 %s, i32 3
+  ret <4 x i32> %v
+
+; CHECK: movi v[[R:[0-9]+]].4s, #-1
+; CHECK: mov v[[R]].s[3], w{{[0-9]+}}
+}
+
+define <1 x i64> @v1i64m(i64 %t, i64 %s) nounwind {
+  %v = insertelement <1 x i64> <i64 undef>, i64 -1, i32 0
+  ret <1 x i64> %v
+
+; CHECK: movi d{{[0-9]+}}, #0xffffffffffffffff
+}
+
+define <2 x i64> @v2i64m(i64 %t, i64 %s) nounwind {
+  %v = insertelement <2 x i64> <i64 -1, i64 -1>, i64 %s, i32 1
+  ret <2 x i64> %v
+
+; CHECK: movi v[[R:[0-9]+]].2d, #0xffffffffffffffff
+; CHECK: mov v[[R]].d[1], x{{[0-9]+}}
+}