Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -802,6 +802,7 @@
     setOperationAction(ISD::ADD, MVT::v8i16, Legal);
     setOperationAction(ISD::ADD, MVT::v4i32, Legal);
     setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
     setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
@@ -1122,7 +1123,7 @@
     setOperationAction(ISD::MUL, MVT::v4i64, Custom);
     setOperationAction(ISD::MUL, MVT::v8i32, Legal);
     setOperationAction(ISD::MUL, MVT::v16i16, Legal);
-    // Don't lower v32i8 because there is no 128-bit byte mul
+    setOperationAction(ISD::MUL, MVT::v32i8, Custom);
 
     setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
     setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
@@ -1171,7 +1172,7 @@
     setOperationAction(ISD::MUL, MVT::v4i64, Custom);
     setOperationAction(ISD::MUL, MVT::v8i32, Custom);
     setOperationAction(ISD::MUL, MVT::v16i16, Custom);
-    // Don't lower v32i8 because there is no 128-bit byte mul
+    setOperationAction(ISD::MUL, MVT::v32i8, Custom);
   }
 
   // In the customized shift lowering, the legal cases in AVX2 will be
@@ -9878,7 +9879,7 @@
 
   int NumV2Elements = std::count_if(Mask.begin(), Mask.end(),
                                     [NumElts](int M) { return M >= NumElts; });
-  
+
   if (NumV2Elements == 1 && Mask[0] >= NumElts)
     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
             DL, VT, V1, V2, Mask, Subtarget, DAG))
@@ -10630,7 +10631,7 @@
       return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
     }
   }
-  
+
   // Get the desired 128-bit vector chunk.
   SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
 
@@ -15892,6 +15893,32 @@
   SDValue A = Op.getOperand(0);
   SDValue B = Op.getOperand(1);
 
+  // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector
+  // pairs, multiply and truncate.
+  if (VT == MVT::v16i8 || VT == MVT::v32i8) {
+    MVT ExVT = (VT == MVT::v16i8 ? MVT::v8i16 : MVT::v16i16);
+    // Extract the lo parts, sign extend to i16 and multiply.
+    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, A, A);
+    SDValue BLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, B, B);
+    ALo = DAG.getNode(ISD::BITCAST, dl, ExVT, ALo);
+    BLo = DAG.getNode(ISD::BITCAST, dl, ExVT, BLo);
+    ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, ExVT));
+    BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, ExVT));
+    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+    // Extract the hi parts, sign extend to i16 and multiply.
+    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, A, A);
+    SDValue BHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, B, B);
+    AHi = DAG.getNode(ISD::BITCAST, dl, ExVT, AHi);
+    BHi = DAG.getNode(ISD::BITCAST, dl, ExVT, BHi);
+    AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, ExVT));
+    BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, ExVT));
+    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+    // Mask the lower 8 bits of the lo/hi results and pack.
+    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, ExVT));
+    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, ExVT));
+    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+  }
+
   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
   if (VT == MVT::v4i32) {
     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
Index: test/CodeGen/X86/avx2-arith.ll
===================================================================
--- test/CodeGen/X86/avx2-arith.ll
+++ test/CodeGen/X86/avx2-arith.ll
@@ -60,6 +60,26 @@
   ret <16 x i16> %x
 }
 
+; CHECK: mul-v32i8
+; CHECK: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; CHECK-NEXT: vpsraw $8, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; CHECK-NEXT: vpsraw $8, %ymm3, %ymm3
+; CHECK-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT: vpand %ymm3, %ymm2, %ymm2
+; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; CHECK-NEXT: vpsraw $8, %ymm1, %ymm1
+; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; CHECK-NEXT: vpsraw $8, %ymm0, %ymm0
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpand %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+define <32 x i8> @mul-v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+  %x = mul <32 x i8> %i, %j
+  ret <32 x i8> %x
+}
+
 ; CHECK: mul-v4i64
 ; CHECK: vpmuludq %ymm
 ; CHECK-NEXT: vpsrlq $32, %ymm
Index: test/CodeGen/X86/pmul.ll
===================================================================
--- test/CodeGen/X86/pmul.ll
+++ test/CodeGen/X86/pmul.ll
@@ -1,6 +1,42 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
 
+define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
+; ALL-LABEL: mul8c:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; ALL-NEXT: movdqa %xmm1, %xmm2
+; ALL-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; ALL-NEXT: psraw $8, %xmm2
+; ALL-NEXT: movdqa %xmm0, %xmm3
+; ALL-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; ALL-NEXT: psraw $8, %xmm3
+; ALL-NEXT: pmullw %xmm2, %xmm3
+; ALL-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; ALL-NEXT: pand %xmm2, %xmm3
+; ALL-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; ALL-NEXT: psraw $8, %xmm1
+; ALL-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; ALL-NEXT: psraw $8, %xmm0
+; ALL-NEXT: pmullw %xmm1, %xmm0
+; ALL-NEXT: pand %xmm2, %xmm0
+; ALL-NEXT: packuswb %xmm3, %xmm0
+; ALL-NEXT: retq
+entry:
+  %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
+  ret <16 x i8> %A
+}
+
+define <8 x i16> @mul16c(<8 x i16> %i) nounwind {
+; ALL-LABEL: mul16c:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: pmullw {{.*}}(%rip), %xmm0
+; ALL-NEXT: retq
+entry:
+  %A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
+  ret <8 x i16> %A
+}
+
 define <4 x i32> @a(<4 x i32> %i) nounwind {
 ; SSE2-LABEL: a:
 ; SSE2: # BB#0: # %entry
@@ -42,6 +78,41 @@
   ret <2 x i64> %A
 }
 
+define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind {
+; ALL-LABEL: mul8:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: movdqa %xmm1, %xmm2
+; ALL-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; ALL-NEXT: psraw $8, %xmm2
+; ALL-NEXT: movdqa %xmm0, %xmm3
+; ALL-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; ALL-NEXT: psraw $8, %xmm3
+; ALL-NEXT: pmullw %xmm2, %xmm3
+; ALL-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; ALL-NEXT: pand %xmm2, %xmm3
+; ALL-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; ALL-NEXT: psraw $8, %xmm1
+; ALL-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; ALL-NEXT: psraw $8, %xmm0
+; ALL-NEXT: pmullw %xmm1, %xmm0
+; ALL-NEXT: pand %xmm2, %xmm0
+; ALL-NEXT: packuswb %xmm3, %xmm0
+; ALL-NEXT: retq
+entry:
+  %A = mul <16 x i8> %i, %j
+  ret <16 x i8> %A
+}
+
+define <8 x i16> @mul16(<8 x i16> %i, <8 x i16> %j) nounwind {
+; ALL-LABEL: mul16:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: pmullw %xmm1, %xmm0
+; ALL-NEXT: retq
+entry:
+  %A = mul <8 x i16> %i, %j
+  ret <8 x i16> %A
+}
+
 define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind {
 ; SSE2-LABEL: c:
 ; SSE2: # BB#0: # %entry
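
For reference, the lowering exercised by the tests above reduces each byte multiply to: sign-extend the bytes to 16 bits (punpck + psraw $8), multiply as i16 (pmullw), mask to the low byte (pand), and pack back to bytes (packuswb). The sketch below is only an illustration of why that sequence matches LLVM's wrapping i8 multiply; it is not part of the patch, and the helper names (MulI8, MulViaI16) are made up for the example.

#include <cassert>
#include <cstdint>

// Reference semantics of 'mul <16 x i8>': each lane wraps modulo 256.
static uint8_t MulI8(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>(A * B);
}

// The widened route used by the lowering: sign extend each byte to 16 bits,
// multiply as i16, and keep only the low 8 bits of the product.
static uint8_t MulViaI16(uint8_t A, uint8_t B) {
  int16_t WA = static_cast<int8_t>(A); // sign extension; the low 8 bits of the
  int16_t WB = static_cast<int8_t>(B); // product do not depend on this choice
  return static_cast<uint8_t>((WA * WB) & 0xFF);
}

int main() {
  // Exhaustive check over all byte pairs: both routes agree on every lane.
  for (int A = 0; A < 256; ++A)
    for (int B = 0; B < 256; ++B)
      assert(MulI8(static_cast<uint8_t>(A), static_cast<uint8_t>(B)) ==
             MulViaI16(static_cast<uint8_t>(A), static_cast<uint8_t>(B)));
  return 0;
}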