Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1472,6 +1472,60 @@
       break;
     }
 
+    case Intrinsic::x86_sse2_packssdw_128:
+    case Intrinsic::x86_sse2_packsswb_128:
+    case Intrinsic::x86_sse2_packuswb_128:
+    case Intrinsic::x86_sse41_packusdw:
+    case Intrinsic::x86_avx2_packssdw:
+    case Intrinsic::x86_avx2_packsswb:
+    case Intrinsic::x86_avx2_packusdw:
+    case Intrinsic::x86_avx2_packuswb: {
+      // TODO Add support for Intrinsic::x86_avx512_mask_pack*
+      auto *Ty0 = II->getArgOperand(0)->getType();
+      unsigned InnerVWidth = Ty0->getVectorNumElements();
+      assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
+
+      unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
+      unsigned VWidthPerLane = VWidth / NumLanes;
+      unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
+
+      // Per lane, pack the elements of the first input and then the second.
+      // e.g.
+      // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
+      // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
+      for (int OpNum = 0; OpNum != 2; ++OpNum) {
+        APInt OpDemandedElts(InnerVWidth, 0);
+        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+          unsigned LaneIdx = Lane * VWidthPerLane;
+          for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
+            unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
+            if (DemandedElts[Idx])
+              OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
+          }
+        }
+
+        // Demand elements from the operand.
+        auto *Op = II->getArgOperand(OpNum);
+        APInt OpUndefElts(InnerVWidth, 0);
+        TmpV = SimplifyDemandedVectorElts(Op, OpDemandedElts, OpUndefElts,
+                                          Depth + 1);
+        if (TmpV) {
+          II->setArgOperand(OpNum, TmpV);
+          MadeChange = true;
+        }
+
+        // Pack the operand's UNDEF elements, one lane at a time.
+        OpUndefElts = OpUndefElts.zext(VWidth);
+        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+          APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
+          LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
+          LaneElts = LaneElts.shl(InnerVWidthPerLane * (2 * Lane + OpNum));
+          UndefElts |= LaneElts;
+        }
+      }
+      break;
+    }
+
     // PSHUFB
     case Intrinsic::x86_ssse3_pshuf_b_128:
     case Intrinsic::x86_avx2_pshuf_b:
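For reference, the demanded-elements gather above can be modeled outside of LLVM. The sketch below is a minimal standalone version of the same index arithmetic, assuming plain uint64_t bitmasks in place of APInt (computeOpDemandedElts is an illustrative name, not part of the patch). It shows why demanding only result elements that come from the first operand, as in the elts_packssdw tests below, leaves the second operand entirely undemanded.

// Standalone model of the OpDemandedElts gather in the patch above.
// Assumes uint64_t masks instead of llvm::APInt; illustrative only.
#include <cassert>
#include <cstdint>

// VWidth = result element count, NumLanes = 128-bit lanes (1 for SSE,
// 2 for AVX2), OpNum = which pack operand (0 or 1). Returns the operand
// elements that must be demanded to cover the demanded result elements.
uint64_t computeOpDemandedElts(uint64_t DemandedElts, unsigned VWidth,
                               unsigned NumLanes, unsigned OpNum) {
  unsigned InnerVWidth = VWidth / 2;
  unsigned VWidthPerLane = VWidth / NumLanes;
  unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
  uint64_t OpDemandedElts = 0;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    unsigned LaneIdx = Lane * VWidthPerLane;
    for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
      // Result element Idx is fed by operand OpNum's element
      // (Lane * InnerVWidthPerLane + Elt).
      unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
      if (DemandedElts & (1ULL << Idx))
        OpDemandedElts |= 1ULL << (Lane * InnerVWidthPerLane + Elt);
    }
  }
  return OpDemandedElts;
}

int main() {
  // v8i16 PACK(v4i32 X, v4i32 Y) = (X[0..3],Y[0..3]):
  // demanding result elements 0..3 demands all of X and none of Y.
  assert(computeOpDemandedElts(0x0F, 8, 1, 0) == 0x0F);
  assert(computeOpDemandedElts(0x0F, 8, 1, 1) == 0x00);
  // v16i16 AVX2 PACK(v8i32 X, v8i32 Y) = (X[0..3],Y[0..3],X[4..7],Y[4..7]):
  // demanding result elements 8..11 (first half of lane 1) demands X[4..7].
  assert(computeOpDemandedElts(0xF00, 16, 2, 0) == 0xF0);
  assert(computeOpDemandedElts(0xF00, 16, 2, 1) == 0x00);
  return 0;
}

The asserts encode the v8i16 and v16i16 cases from the comment in the patch; any C++11 compiler can build and run the file as-is.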
Index: llvm/trunk/test/Transforms/InstCombine/x86-pack.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-pack.ll
+++ llvm/trunk/test/Transforms/InstCombine/x86-pack.ll
@@ -7,11 +7,9 @@
 
 define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @elts_packssdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> undef, <8 x i32>
-; CHECK-NEXT:    ret <8 x i16> [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> undef)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32>
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 ;
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
@@ -22,10 +20,8 @@
 
 define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @elts_packusdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> %a0, i32 0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> %a1, i32 0, i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
 ;
   %1 = insertelement <4 x i32> %a0, i32 0, i32 0
   %2 = insertelement <4 x i32> %a1, i32 0, i32 3
@@ -36,11 +32,9 @@
 
 define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: @elts_packsswb_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i16> %a0, i16 0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i16> %a1, i16 0, i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    ret <16 x i8> [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> , <8 x i16> )
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32>
+; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 ;
   %1 = insertelement <8 x i16> %a0, i16 0, i32 0
   %2 = insertelement <8 x i16> %a1, i16 0, i32 0
@@ -51,9 +45,7 @@
 
 define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: @elts_packuswb_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> , <8 x i16> )
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+; CHECK-NEXT:    ret <16 x i8> undef
 ;
   %1 = insertelement <8 x i16> undef, i16 0, i32 0
   %2 = insertelement <8 x i16> undef, i16 0, i32 0
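The tests above also exercise the reverse direction: once an operand's elements are known undef, the second loop in the patch scatters those bits into the result positions they pack into, one 128-bit lane at a time, which is what lets elts_packuswb_128 fold all the way to ret <16 x i8> undef. Below is a matching standalone sketch under the same uint64_t-for-APInt assumption (packOpUndefElts is an illustrative name, not part of the patch).

// Standalone model of the OpUndefElts scatter loop in the patch: map an
// operand's known-undef elements to the result positions they pack into.
// Same uint64_t-for-APInt simplification as the sketch above.
#include <cassert>
#include <cstdint>

uint64_t packOpUndefElts(uint64_t OpUndefElts, unsigned VWidth,
                         unsigned NumLanes, unsigned OpNum) {
  unsigned InnerVWidthPerLane = (VWidth / 2) / NumLanes;
  uint64_t UndefElts = 0;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    // Take this lane's slice of the operand mask...
    uint64_t LaneElts = OpUndefElts >> (InnerVWidthPerLane * Lane);
    LaneElts &= (1ULL << InnerVWidthPerLane) - 1; // APInt::getLoBits
    // ...and shift it to the half-lane of the result it packs into.
    LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
    UndefElts |= LaneElts;
  }
  return UndefElts;
}

int main() {
  // v8i16 PACK(v4i32 X, v4i32 Y): if all of Y is undef, result
  // elements 4..7 are undef.
  assert(packOpUndefElts(0x0F, 8, 1, 1) == 0xF0);
  // v16i16 AVX2 PACK(v8i32 X, v8i32 Y): Y[4..7] undef lands in result
  // elements 12..15 (second half of lane 1).
  assert(packOpUndefElts(0xF0, 16, 2, 1) == 0xF000);
  return 0;
}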
@@ -64,10 +56,8 @@
 
 define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @elts_packssdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> undef)
+; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
 ;
   %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
   %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
@@ -79,7 +69,7 @@
 define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @elts_packusdw_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32>
 ; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
 ;
@@ -92,11 +82,9 @@
 
 define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK-LABEL: @elts_packsswb_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i16> %a0, i16 0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i16> %a1, i16 0, i32 8
-; CHECK-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i8> [[TMP3]], <32 x i8> undef, <32 x i32>
-; CHECK-NEXT:    ret <32 x i8> [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> , <16 x i16> )
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32>
+; CHECK-NEXT:    ret <32 x i8> [[TMP2]]
 ;
   %1 = insertelement <16 x i16> %a0, i16 0, i32 0
   %2 = insertelement <16 x i16> %a1, i16 0, i32 8
@@ -107,9 +95,7 @@
 
 define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK-LABEL: @elts_packuswb_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> , <16 x i16> )
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32> zeroinitializer
-; CHECK-NEXT:    ret <32 x i8> [[TMP2]]
+; CHECK-NEXT:    ret <32 x i8> undef
 ;
   %1 = insertelement <16 x i16> undef, i16 0, i32 1
   %2 = insertelement <16 x i16> undef, i16 0, i32 0
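The 256-bit tests above depend on the two directions composing per 128-bit lane. A hypothetical driver, assuming the computeOpDemandedElts and packOpUndefElts sketches from earlier are in scope (again a model, not the LLVM implementation), walks the v32i8 PACKSSWB case end to end:

#include <cassert>
#include <cstdint>

// Forward declarations of the sketch helpers defined above.
uint64_t computeOpDemandedElts(uint64_t, unsigned, unsigned, unsigned);
uint64_t packOpUndefElts(uint64_t, unsigned, unsigned, unsigned);

// Hypothetical driver for v32i8 PACKSSWB(v16i16 X, v16i16 Y).
int main() {
  const unsigned VWidth = 32, NumLanes = 2;
  // Demand only the low 16 result bytes, i.e. all of lane 0.
  uint64_t Demanded = 0xFFFF;
  // Lane 0 of the result is (X[0..7],Y[0..7]), so each operand only
  // needs its low 8 elements -- elements 8..15 of both inputs are dead.
  assert(computeOpDemandedElts(Demanded, VWidth, NumLanes, 0) == 0xFF);
  assert(computeOpDemandedElts(Demanded, VWidth, NumLanes, 1) == 0xFF);
  // If Y[0..7] are then found to be undef, result bytes 8..15 (the
  // second half of lane 0) become undef in turn.
  assert(packOpUndefElts(0xFF, VWidth, NumLanes, 1) == 0xFF00);
  return 0;
}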