Index: lib/Target/X86/X86CallingConv.td
===================================================================
--- lib/Target/X86/X86CallingConv.td
+++ lib/Target/X86/X86CallingConv.td
@@ -76,6 +76,9 @@
     // Promote i1/i8/i16 arguments to i32.
     CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
 
+    // Promote v8i1/v16i1/v32i1 arguments to i32.
+    CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>,
+
     // bool, char, int, enum, long, pointer --> GPR
     CCIfType<[i32], CCAssignToReg<RC.GPR_32>>,
 
@@ -89,9 +92,6 @@
     CCIfSubtarget<"is32Bit()", CCIfType<[i64], 
       CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
 
-    // TODO: Handle the case of mask types (v*i1)
-    CCIfType<[v8i1, v16i1, v32i1], CCCustom<"CC_X86_RegCall_Error">>,
-
     // float, double, float128 --> XMM
     // In the case of SSE disabled --> save to stack
     CCIfType<[f32, f64, f128], 
@@ -146,9 +146,15 @@
 ]>;
 
 def RetCC_#NAME : CallingConv<[
-    // Promote i1 arguments to i8.
-    CCIfType<[i1], CCPromoteToType<i8>>,
+    // Promote i1 and v8i1 return values to i8.
+    CCIfType<[i1, v8i1], CCPromoteToType<i8>>,
 
+    // Promote v16i1 return values to i16.
+    CCIfType<[v16i1], CCPromoteToType<i16>>,
+
+    // Promote v32i1 return values to i32.
+    CCIfType<[v32i1], CCPromoteToType<i32>>,
+
     // bool, char, int, enum, long, pointer --> GPR
     CCIfType<[i8], CCAssignToReg<RC.GPR_8>>,
     CCIfType<[i16], CCAssignToReg<RC.GPR_16>>,
@@ -164,9 +170,6 @@
     CCIfSubtarget<"is32Bit()", CCIfType<[i64], 
       CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
 
-    // TODO: Handle the case of mask types (v*i1)
-    CCIfType<[v8i1, v16i1, v32i1], CCCustom<"CC_X86_RegCall_Error">>,
-
     // long double --> FP
     CCIfType<[f80], CCAssignToReg<RC.FP_RET>>,
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -2100,8 +2100,29 @@
                                const SDLoc &Dl, SelectionDAG &DAG) {
   EVT ValVT = ValArg.getValueType();
 
-  if (ValVT == MVT::v64i1 && ValLoc == MVT::i64) {
+  if (ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) {
+    // Two stage lowering might be required
+    // bitcast:   v8i1 -> i8
+    // anyextend: i8   -> i32
+    SDValue ValToCopy = DAG.getBitcast(MVT::i8, ValArg);
+    return (ValLoc != MVT::i8)
+               ? DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy)
+               : ValToCopy;
+  } else if (ValVT == MVT::v16i1 &&
+             (ValLoc == MVT::i16 || ValLoc == MVT::i32)) {
+    // Two stage lowering might be required
+    // bitcast:   v16i1 -> i16
+    // anyextend: i16   -> i32
+    SDValue ValToCopy = DAG.getBitcast(MVT::i16, ValArg);
+    return (ValLoc != MVT::i16)
+               ? DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy)
+               : ValToCopy;
+  } else if (ValVT == MVT::v32i1 && ValLoc == MVT::i32) {
     // One stage lowering is required
+    // bitcast:   v32i1 -> i32
+    return DAG.getBitcast(MVT::i32, ValArg);
+  } else if (ValVT == MVT::v64i1 && ValLoc == MVT::i64) {
+    // One stage lowering is required
     // bitcast:   v64i1 -> i64
     return DAG.getBitcast(MVT::i64, ValArg);
   } else
@@ -2435,20 +2456,26 @@
 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                                const EVT &ValLoc, const SDLoc &Dl,
                                SelectionDAG &DAG) {
-  assert((ValLoc == MVT::i64 || ValLoc == MVT::i32) &&
-         "Expecting register location of size 32/64 bit");
+  SDValue ValReturned = ValArg;
 
-  // Currently not referenced - will be used in other mask lowering
-  (void)Dl;
-
-  // In the case of v64i1 no special handling is required due to two reasons:
-  // In 32 bit machine, this case is handled by getv64i1Argument
-  // In 64 bit machine, There is no need to truncate the value only bitcast
-  if (ValVT == MVT::v64i1 && ValLoc == MVT::i32) {
-    llvm_unreachable("Expecting only i64 locations");
+  // Truncate the location register to the mask width, then bitcast to v*i1.
+  if (ValVT == MVT::v8i1) {
+    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, MVT::i8, ValReturned);
+  } else if (ValVT == MVT::v16i1) {
+    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, MVT::i16, ValReturned);
+  } else if (ValVT == MVT::v32i1) {
+    // Only an i64 location is wider than the mask and needs truncating.
+    if (ValLoc == MVT::i64)
+      ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, MVT::i32, ValReturned);
+  } else if (ValVT == MVT::v64i1) {
+    // 32-bit targets handle v64i1 in getv64i1Argument, so only an i64 location
+    // is expected here; it needs a bitcast but no truncation.
+    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
+  } else {
+    llvm_unreachable("Expecting a vector of i1 types");
   }
 
-  return DAG.getBitcast(ValVT, ValArg);
+  return DAG.getBitcast(ValVT, ValReturned);
 }
 
 /// Lower the result values of a call into the
@@ -2509,8 +2536,9 @@
 
     if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
       if (VA.getValVT().isVector() &&
-          (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::i64)) {
-        // promoting a mask type (v*i1) into a register of type i64/i32
+          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
+           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
+        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
         Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
       } else
         Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
@@ -2863,8 +2891,9 @@
           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
         else if (VA.getValVT().isVector() &&
                  VA.getValVT().getScalarType() == MVT::i1 &&
-                 ((RegVT == MVT::i32) || (RegVT == MVT::i64))) {
-          // Promoting a mask type (v*i1) into a register of type i64/i32
+                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
+                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
+          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
           ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
         } else
           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
Index: test/CodeGen/X86/avx512-regcall-Mask.ll
===================================================================
--- test/CodeGen/X86/avx512-regcall-Mask.ll
+++ test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -193,3 +193,199 @@
   %call = call x86_regcallcc <64 x i1> @test_retv64i1()
   ret <64 x i1> %call
 }
+
+; X32-LABEL:  test_argv32i1:
+; X32:        addl    %ecx, %eax
+; X32:        addl    %edx, %eax
+; X32:        retl
+
+; WIN64-LABEL: test_argv32i1:
+; WIN64:       addl    %ecx, %eax
+; WIN64:       addl    %edx, %eax
+; WIN64:       retq
+
+; Test regcall when receiving arguments of v32i1 type
+define x86_regcallcc i32 @test_argv32i1(<32 x i1> %x0, <32 x i1> %x1, <32 x i1> %x2)  {
+  %a = bitcast <32 x i1> %x0 to i32
+  %b = bitcast <32 x i1> %x1 to i32
+  %c = bitcast <32 x i1> %x2 to i32
+  %add1 = add i32 %a, %b
+  %add2 = add i32 %add1, %c
+  ret i32 %add2
+}
+
+; X32-LABEL:  caller_argv32i1:
+; X32:        mov{{.*}}    $1, %eax
+; X32:        mov{{.*}}    $1, %ecx
+; X32:        call{{.*}}   _test_argv32i1
+
+; WIN64-LABEL: caller_argv32i1:
+; WIN64:       mov{{.*}}   $1, %eax
+; WIN64:       mov{{.*}}   $1, %ecx
+; WIN64:       mov{{.*}}   $1, %edx
+; WIN64:       call{{.*}}  test_argv32i1
+
+; Test regcall when passing arguments of v32i1 type
+define x86_regcallcc i32 @caller_argv32i1() #0 {
+entry:
+  %v0 = bitcast i32 1 to <32 x i1>
+  %call = call x86_regcallcc i32 @test_argv32i1(<32 x i1> %v0, <32 x i1> %v0, <32 x i1> %v0)
+  ret i32 %call
+}
+
+; X32-LABEL: test_retv32i1:
+; X32:       movl    $1, %eax
+; X32:       retl
+
+; WIN64-LABEL: test_retv32i1:
+; WIN64:       movl $1, %eax
+; WIN64:       retq
+
+; Test regcall when returning v32i1 type
+define x86_regcallcc <32 x i1> @test_retv32i1()  {
+  %a = bitcast i32 1 to <32 x i1>
+  ret <32 x i1> %a
+}
+
+; X32-LABEL: caller_retv32i1:
+; X32:       call{{.*}}   _test_retv32i1
+; X32:       incl %eax
+
+; Test regcall when processing result of v32i1 type
+define x86_regcallcc i32 @caller_retv32i1() #0 {
+entry:
+  %call = call x86_regcallcc <32 x i1> @test_retv32i1()
+  %c = bitcast <32 x i1> %call to i32
+  %add = add i32 %c, 1
+  ret i32 %add
+}
+
+; X32-LABEL:  test_argv16i1:
+; X32:        addl    %ecx, %eax
+; X32:        addl    %edx, %eax
+; X32:        retl
+
+; WIN64-LABEL: test_argv16i1:
+; WIN64:       addl    %ecx, %eax
+; WIN64:       addl    %edx, %eax
+; WIN64:       retq
+
+; Test regcall when receiving arguments of v16i1 type
+define x86_regcallcc i16 @test_argv16i1(<16 x i1> %x0, <16 x i1> %x1, <16 x i1> %x2)  {
+  %a = bitcast <16 x i1> %x0 to i16
+  %b = bitcast <16 x i1> %x1 to i16
+  %c = bitcast <16 x i1> %x2 to i16
+  %add1 = add i16 %a, %b
+  %add2 = add i16 %add1, %c
+  ret i16 %add2
+}
+
+; X32-LABEL:  caller_argv16i1:
+; X32:        movl    $1, %eax
+; X32:        movl    $1, %ecx
+; X32:        calll   _test_argv16i1
+
+; WIN64-LABEL: caller_argv16i1:
+; WIN64:       movl    $1, %eax
+; WIN64:       movl    $1, %ecx
+; WIN64:       movl    $1, %edx
+; WIN64:       callq   test_argv16i1
+
+; Test regcall when passing arguments of v16i1 type
+define x86_regcallcc i16 @caller_argv16i1() #0 {
+entry:
+  %v0 = bitcast i16 1 to <16 x i1>
+  %call = call x86_regcallcc i16 @test_argv16i1(<16 x i1> %v0, <16 x i1> %v0, <16 x i1> %v0)
+  ret i16 %call
+}
+
+; X32-LABEL: test_retv16i1:
+; X32:       movw    $1, %ax
+; X32:       retl
+
+; WIN64-LABEL: test_retv16i1:
+; WIN64:       movw $1, %ax
+; WIN64:       retq
+
+; Test regcall when returning v16i1 type
+define x86_regcallcc <16 x i1> @test_retv16i1()  {
+  %a = bitcast i16 1 to <16 x i1>
+  ret <16 x i1> %a
+}
+
+; X32-LABEL: caller_retv16i1:
+; X32:       calll   _test_retv16i1
+; X32:       incl   %eax
+
+; Test regcall when processing result of v16i1 type
+define x86_regcallcc i16 @caller_retv16i1() #0 {
+entry:
+  %call = call x86_regcallcc <16 x i1> @test_retv16i1()
+  %c = bitcast <16 x i1> %call to i16
+  %add = add i16 %c, 1
+  ret i16 %add
+}
+
+; X32-LABEL:  test_argv8i1:
+; X32:        addb   %cl, %al
+; X32:        addb   %dl, %al
+; X32:        retl
+
+; WIN64-LABEL: test_argv8i1:
+; WIN64:       addb   %cl, %al
+; WIN64:       addb   %dl, %al
+; WIN64:       retq
+
+; Test regcall when receiving arguments of v8i1 type
+define x86_regcallcc i8 @test_argv8i1(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2)  {
+  %a = bitcast <8 x i1> %x0 to i8
+  %b = bitcast <8 x i1> %x1 to i8
+  %c = bitcast <8 x i1> %x2 to i8
+  %add1 = add i8 %a, %b
+  %add2 = add i8 %add1, %c
+  ret i8 %add2
+}
+
+; X32-LABEL:  caller_argv8i1:
+; X32:        movl    $1, %eax
+; X32:        movl    $1, %ecx
+; X32:        calll   _test_argv8i1
+
+; WIN64-LABEL: caller_argv8i1:
+; WIN64:       movl    $1, %eax
+; WIN64:       movl    $1, %ecx
+; WIN64:       movl    $1, %edx
+; WIN64:       callq   test_argv8i1
+
+; Test regcall when passing arguments of v8i1 type
+define x86_regcallcc i8 @caller_argv8i1() #0 {
+entry:
+  %v0 = bitcast i8 1 to <8 x i1>
+  %call = call x86_regcallcc i8 @test_argv8i1(<8 x i1> %v0, <8 x i1> %v0, <8 x i1> %v0)
+  ret i8 %call
+}
+
+; X32-LABEL: test_retv8i1:
+; X32:       movb    $1, %al
+; X32:       retl
+
+; WIN64-LABEL: test_retv8i1:
+; WIN64:       movb $1, %al
+; WIN64:       retq
+
+; Test regcall when returning v8i1 type
+define x86_regcallcc <8 x i1> @test_retv8i1()  {
+  %a = bitcast i8 1 to <8 x i1>
+  ret <8 x i1> %a
+}
+
+; X32-LABEL: caller_retv8i1:
+; X32:       calll   _test_retv8i1
+; X32:       retl
+
+; Test regcall when processing result of v8i1 type
+define x86_regcallcc <8 x i1> @caller_retv8i1() #0 {
+entry:
+  %call = call x86_regcallcc <8 x i1> @test_retv8i1()
+  ret <8 x i1> %call
+}