Index: llvm/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsX86.td
+++ llvm/include/llvm/IR/IntrinsicsX86.td
@@ -513,15 +513,6 @@
 
 // Misc.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_packsswb_128 : GCCBuiltin<"__builtin_ia32_packsswb128">,
-              Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty,
-                         llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_sse2_packssdw_128 : GCCBuiltin<"__builtin_ia32_packssdw128">,
-              Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty,
-                         llvm_v4i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_packuswb_128 : GCCBuiltin<"__builtin_ia32_packuswb128">,
-              Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty,
-                         llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_sse2_movmsk_pd : GCCBuiltin<"__builtin_ia32_movmskpd">,
               Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
   def int_x86_sse2_pmovmskb_128 : GCCBuiltin<"__builtin_ia32_pmovmskb128">,
@@ -793,13 +784,6 @@
                     [IntrNoMem]>;
 }
 
-// Vector pack
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_packusdw        : GCCBuiltin<"__builtin_ia32_packusdw128">,
-              Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
-                        [IntrNoMem]>;
-}
-
 // Vector insert
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse41_insertps       : GCCBuiltin<"__builtin_ia32_insertps128">,
@@ -1801,22 +1785,6 @@
                    llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
 }
 
-// Pack ops.
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx2_packsswb : GCCBuiltin<"__builtin_ia32_packsswb256">,
-              Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty,
-                         llvm_v16i16_ty], [IntrNoMem]>;
-  def int_x86_avx2_packssdw : GCCBuiltin<"__builtin_ia32_packssdw256">,
-              Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty,
-                         llvm_v8i32_ty], [IntrNoMem]>;
-  def int_x86_avx2_packuswb : GCCBuiltin<"__builtin_ia32_packuswb256">,
-              Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty,
-                         llvm_v16i16_ty], [IntrNoMem]>;
-  def int_x86_avx2_packusdw : GCCBuiltin<"__builtin_ia32_packusdw256">,
-              Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty,
-                         llvm_v8i32_ty], [IntrNoMem]>;
-}
-
 // Horizontal arithmetic ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_phadd_w : GCCBuiltin<"__builtin_ia32_phaddw256">,
@@ -3758,22 +3726,6 @@
                          llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
 }
 
-// Pack ops.
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_packsswb_512 : GCCBuiltin<"__builtin_ia32_packsswb512">,
-              Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty],
-                        [IntrNoMem]>;
-  def int_x86_avx512_packssdw_512 : GCCBuiltin<"__builtin_ia32_packssdw512">,
-              Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
-                         [IntrNoMem]>;
-  def int_x86_avx512_packuswb_512 : GCCBuiltin<"__builtin_ia32_packuswb512">,
-              Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty],
-                         [IntrNoMem]>;
-  def int_x86_avx512_packusdw_512 : GCCBuiltin<"__builtin_ia32_packusdw512">,
-              Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
-                         [IntrNoMem]>;
-}
-
 // Vector convert
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_mask_cvtdq2ps_128 : // TODO: remove this intrinsic
Index: llvm/lib/IR/AutoUpgrade.cpp
===================================================================
--- llvm/lib/IR/AutoUpgrade.cpp
+++ llvm/lib/IR/AutoUpgrade.cpp
@@ -298,7 +298,11 @@
       Name.startswith("avx512.ptestnm") || //Added in 6.0
       Name.startswith("sse2.pavg") || // Added in 6.0
       Name.startswith("avx2.pavg") || // Added in 6.0
-      Name.startswith("avx512.mask.pavg")) // Added in 6.0
+      Name.startswith("avx512.mask.pavg") || // Added in 6.0
+      Name.startswith("sse2.pack") || // Added in 7.0
+      Name.startswith("sse41.pack") || // Added in 7.0
+      Name.startswith("avx2.pack") || // Added in 7.0
+      Name.startswith("avx512.pack")) // Added in 7.0
     return true;
 
   return false;
@@ -1024,6 +1028,45 @@
   return Builder.CreateSExt(Mask, ReturnOp, "vpmovm2");
 }
 
+static Value *EmitX86Pack(IRBuilder<> &Builder, CallInst &CI, bool IsUnsigned,
+                          int EltSize) {
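+  // Replace the PACK intrinsic with generic IR: interleave the two inputs
+  // per 128-bit lane, clamp to the destination element range, then truncate.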
+  Value *A = CI.getArgOperand(0);
+  Value *B = CI.getArgOperand(1);
+
+  Type *Ty = A->getType();
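+  // Compute the saturation bounds of the destination element type (half of
+  // EltSize), extended back to the source width for the comparisons below.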
+  APInt MinVal, MaxVal;
+  if (IsUnsigned) {
+    MinVal = APInt::getMinValue(EltSize / 2).zext(EltSize);
+    MaxVal = APInt::getMaxValue(EltSize / 2).zext(EltSize);
+  } else {
+    MinVal = APInt::getSignedMinValue(EltSize / 2).sext(EltSize);
+    MaxVal = APInt::getSignedMaxValue(EltSize / 2).sext(EltSize);
+  }
+
+  SmallVector<uint32_t, 16> ShuffleMask;
+  unsigned NumElts = Ty->getVectorNumElements();
+  unsigned NumLanes = NumElts * Ty->getScalarSizeInBits() / 128;
+  unsigned NumEltsPerLane = 128 / EltSize;
+
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    for (unsigned Elt = 0; Elt != NumEltsPerLane; ++Elt)
+      ShuffleMask.push_back(Elt + (Lane * NumEltsPerLane));
+    for (unsigned Elt = 0; Elt != NumEltsPerLane; ++Elt)
+      ShuffleMask.push_back(Elt + (Lane * NumEltsPerLane) + NumElts);
+  }
+
+  Value *Res = Builder.CreateShuffleVector(A, B, ShuffleMask);
+  Type *RTy = Res->getType();
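+  // Clamp the interleaved values to [MinVal, MaxVal]. Signed predicates are
+  // correct for PACKUS too, since it saturates a *signed* source to an
+  // unsigned range.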
+  Value *MinVec = ConstantInt::get(RTy, MinVal);
+  Value *MaxVec = ConstantInt::get(RTy, MaxVal);
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_SLT, Res, MaxVec);
+  Res = Builder.CreateSelect(Cmp, Res, MaxVec);
+  Cmp = Builder.CreateICmp(ICmpInst::ICMP_SGT, Res, MinVec);
+  Res = Builder.CreateSelect(Cmp, Res, MinVec);
+  Type *VTy = VectorType::get(Builder.getIntNTy(EltSize / 2), NumElts * 2);
+  return Builder.CreateTrunc(Res, VTy);
+}
+
 // Replace intrinsic with unmasked version and a select.
 static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
                                       CallInst &CI, Value *&Rep) {
@@ -1108,42 +1151,12 @@
       IID = Intrinsic::x86_avx512_pmaddubs_w_512;
     else
       llvm_unreachable("Unexpected intrinsic");
-  } else if (Name.startswith("packsswb.")) {
-    if (VecWidth == 128)
-      IID = Intrinsic::x86_sse2_packsswb_128;
-    else if (VecWidth == 256)
-      IID = Intrinsic::x86_avx2_packsswb;
-    else if (VecWidth == 512)
-      IID = Intrinsic::x86_avx512_packsswb_512;
-    else
-      llvm_unreachable("Unexpected intrinsic");
-  } else if (Name.startswith("packssdw.")) {
-    if (VecWidth == 128)
-      IID = Intrinsic::x86_sse2_packssdw_128;
-    else if (VecWidth == 256)
-      IID = Intrinsic::x86_avx2_packssdw;
-    else if (VecWidth == 512)
-      IID = Intrinsic::x86_avx512_packssdw_512;
-    else
-      llvm_unreachable("Unexpected intrinsic");
-  } else if (Name.startswith("packuswb.")) {
-    if (VecWidth == 128)
-      IID = Intrinsic::x86_sse2_packuswb_128;
-    else if (VecWidth == 256)
-      IID = Intrinsic::x86_avx2_packuswb;
-    else if (VecWidth == 512)
-      IID = Intrinsic::x86_avx512_packuswb_512;
-    else
-      llvm_unreachable("Unexpected intrinsic");
-  } else if (Name.startswith("packusdw.")) {
-    if (VecWidth == 128)
-      IID = Intrinsic::x86_sse41_packusdw;
-    else if (VecWidth == 256)
-      IID = Intrinsic::x86_avx2_packusdw;
-    else if (VecWidth == 512)
-      IID = Intrinsic::x86_avx512_packusdw_512;
-    else
-      llvm_unreachable("Unexpected intrinsic");
+  } else if (Name.startswith("pack")) {
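+    // Name is "pack{ss,us}{wb,dw}.<width>" (the "avx512.mask." prefix has
+    // already been stripped): index 4 gives the signedness and index 6 the
+    // source element type ('d' for dwords, 'w' for words).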
+    bool IsUnsigned = Name[4] == 'u';
+    int EltSize = (Name[6] == 'd') ? 32 : 16;
+    Rep = EmitX86Pack(Builder, CI, IsUnsigned, EltSize);
+    Rep = EmitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2));
+    return true;
   } else if (Name.startswith("vpermilvar.")) {
     if (VecWidth == 128 && EltWidth == 32)
       IID = Intrinsic::x86_avx_vpermilvar_ps;
@@ -2100,6 +2113,21 @@
                                { CI->getArgOperand(0), Builder.getInt1(false) });
       Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                           CI->getArgOperand(1));
+    } else if (IsX86 &&
+               (Name.startswith("sse2.pack") || Name.startswith("sse41.pack") ||
+                Name.startswith("avx2.pack") ||
+                Name.startswith("avx512.pack"))) {
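+      // Strip the ".128"/".256"/".512" width suffix if present, then read the
+      // signedness character ('s' or 'u') and the source element character
+      // ('w' for words, 'd' for dwords) off the end of the name.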
+      int L = Name.size();
+      L = (Name[L - 4] == '.') ? L - 4 : L;
+      bool IsUnsigned = Name[L - 4] == 'u';
+      int EltSize = (Name[L - 2] == 'd') ? 32 : 16;
+      Rep = EmitX86Pack(Builder, *CI, IsUnsigned, EltSize);
+    } else if (IsX86 && Name.startswith("avx512.mask.pack")) {
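+      // Skip past the "avx512.mask." prefix: indices 16 and 18 are the same
+      // signedness and source element characters as above.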
+      bool IsUnsigned = Name[16] == 'u';
+      int EltSize = (Name[18] == 'd') ? 32 : 16;
+      Rep = EmitX86Pack(Builder, *CI, IsUnsigned, EltSize);
+      Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+                          CI->getArgOperand(2));
     } else if (IsX86 && Name.startswith("avx512.mask.psll")) {
       bool IsImmediate = Name[16] == 'i' ||
                          (Name.size() > 18 && Name[18] == 'i');
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34692,6 +34692,77 @@
   return detectUSatPattern(In, VT);
 }
 
+/// Detect a shuffle that interleaves word or dword elements of two vectors so
+/// that each 256 bits of source data can be compressed into a 128-bit lane by
+/// a PACK instruction, and return the original vectors in parameters A and B.
+static bool tracePackVectorShuffle(SDValue SatVal, EVT VT,
+                                   const X86Subtarget &Subtarget,
+                                   bool IsUnsigned, SDValue &A, SDValue &B) {
+  // A 128-bit PACK op's input pattern is simply a concat of its two inputs.
+  if (VT.getSizeInBits() == 128) {
+    if (!Subtarget.hasSSE2())
+      return false;
+    if (SatVal.getOpcode() == ISD::CONCAT_VECTORS &&
+        SatVal.getNumOperands() == 2) {
+      A = SatVal.getOperand(0);
+      B = SatVal.getOperand(1);
+      // Check that this isn't a PACKUSDW pattern without SSE4.1.
+      if (IsUnsigned && A.getValueType().getScalarType() == MVT::i32 &&
+          !Subtarget.hasSSE41())
+        return false;
+      return true;
+    }
+    return false;
+  }
+
+  // 256-bit PACK requires AVX2; 512-bit PACK requires AVX512BW registers.
+  if (VT.getSizeInBits() == 256 && !Subtarget.hasAVX2())
+    return false;
+  if (VT.getSizeInBits() == 512 && !Subtarget.useBWIRegs())
+    return false;
+
+  // Check that the pattern is a shuffle of two vectors, both of which are
+  // the original inputs expanded to the same number of elements as the output
+  // through concatenation.
+  if (SatVal.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+  auto Shuffle = cast<ShuffleVectorSDNode>(SatVal.getNode());
+  A = Shuffle->getOperand(0);
+  B = Shuffle->getOperand(1);
+  // Cases where A == B get optimized to a distinct unary pattern.
+  bool IsUnary = false;
+  if (B.isUndef()) {
+    IsUnary = true;
+    B = A;
+  }
+  if (A.getOpcode() != ISD::CONCAT_VECTORS || A.getNumOperands() != 2 ||
+      B.getOpcode() != ISD::CONCAT_VECTORS || B.getNumOperands() != 2)
+    return false;
+  // Get the original inputs of the pattern.
+  A = A.getOperand(0);
+  B = B.getOperand(0);
+
+  // Check the shuffle mask. createPackShuffleMask is not used here because it
+  // skips the odd-numbered elements of each lane in each input.
+  SmallVector<int, 16> ShuffleMask;
+  EVT InVT = A.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned Offset = IsUnary ? 0 : VT.getVectorNumElements();
+  unsigned NumLanes = InVT.getSizeInBits() / 128;
+  unsigned NumEltsPerLane = 128 / InVT.getScalarSizeInBits();
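+  // e.g. packing two v8i32 inputs into a v16i16 expects the mask
+  // <0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23> (Offset is 0 if unary).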
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    for (unsigned Elt = 0; Elt != NumEltsPerLane; ++Elt)
+      ShuffleMask.push_back(Elt + (Lane * NumEltsPerLane));
+    for (unsigned Elt = 0; Elt != NumEltsPerLane; ++Elt)
+      ShuffleMask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
+  }
+
+  for (unsigned i = 0; i < NumElts; ++i)
+    if (Shuffle->getMaskElt(i) != ShuffleMask[i])
+      return false;
+  return true;
+}
+
 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
@@ -34699,6 +34770,28 @@
   EVT InVT = In.getValueType();
   EVT InSVT = InVT.getScalarType();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // A special case for a full PACK pattern: detect the vector concatenation
+  // (128-bit case) or the cross-lane shuffle (wider cases) directly.
+  // FIXME: the output pattern is basically the same as the others, a
+  // truncation of a saturated value, but the preference for generating VTRUNC
+  // for more generic patterns in truncateVectorWithPACK means we replicate
+  // some of those checks here.
+  if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
+      ((SVT == MVT::i8 && InSVT == MVT::i16) ||
+       (SVT == MVT::i16 && InSVT == MVT::i32))) {
+    unsigned Opcode = 0;
+    SDValue SatVal;
+    if (auto SSatVal = detectSSatPattern(In, VT)) {
+      Opcode = X86ISD::PACKSS;
+      SatVal = SSatVal;
+    } else if (auto USatVal = detectSSatPattern(In, VT, true)) {
+      Opcode = X86ISD::PACKUS;
+      SatVal = USatVal;
+    }
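+    // Trace the saturated value back through the interleaving shuffle (or
+    // concatenation) to recover the original PACK operands.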
+    SDValue A, B;
+    if (Opcode && tracePackVectorShuffle(SatVal, VT, Subtarget,
+                                         (Opcode == X86ISD::PACKUS), A, B))
+      return DAG.getNode(Opcode, DL, VT, A, B);
+  }
   if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
       isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
     if (auto SSatVal = detectSSatPattern(In, VT))
Index: llvm/lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -398,10 +398,6 @@
   X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_ps,     INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
-  X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
@@ -1438,10 +1434,6 @@
   X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
 
-  X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx512_pmaddubs_w_512, INTR_TYPE_2OP,
                      X86ISD::VPMADDUBSW, 0),
   X86_INTRINSIC_DATA(avx512_pmaddw_d_512, INTR_TYPE_2OP,
@@ -1599,9 +1591,6 @@
   X86_INTRINSIC_DATA(sse2_min_pd,       INTR_TYPE_2OP, X86ISD::FMIN, 0),
   X86_INTRINSIC_DATA(sse2_min_sd,       INTR_TYPE_2OP, X86ISD::FMINS, 0),
   X86_INTRINSIC_DATA(sse2_movmsk_pd,    INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
-  X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(sse2_padds_b,      INTR_TYPE_2OP, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(sse2_padds_w,      INTR_TYPE_2OP, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(sse2_paddus_b,     INTR_TYPE_2OP, X86ISD::ADDUS, 0),
@@ -1645,7 +1634,6 @@
   X86_INTRINSIC_DATA(sse3_hsub_pd,      INTR_TYPE_2OP, X86ISD::FHSUB, 0),
   X86_INTRINSIC_DATA(sse3_hsub_ps,      INTR_TYPE_2OP, X86ISD::FHSUB, 0),
   X86_INTRINSIC_DATA(sse41_insertps,    INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
-  X86_INTRINSIC_DATA(sse41_packusdw,    INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(sse41_phminposuw,  INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
   X86_INTRINSIC_DATA(sse41_round_pd,    ROUNDP, X86ISD::VRNDSCALE, 0),
   X86_INTRINSIC_DATA(sse41_round_ps,    ROUNDP, X86ISD::VRNDSCALE, 0),
Index: llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -566,81 +566,6 @@
   return Builder.CreateAShr(Vec, ShiftVec);
 }
 
-static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
-  Value *Arg0 = II.getArgOperand(0);
-  Value *Arg1 = II.getArgOperand(1);
-  Type *ResTy = II.getType();
-
-  // Fast all undef handling.
-  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
-    return UndefValue::get(ResTy);
-
-  Type *ArgTy = Arg0->getType();
-  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
-  unsigned NumDstElts = ResTy->getVectorNumElements();
-  unsigned NumSrcElts = ArgTy->getVectorNumElements();
-  assert(NumDstElts == (2 * NumSrcElts) && "Unexpected packing types");
-
-  unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
-  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
-  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
-  assert(ArgTy->getScalarSizeInBits() == (2 * DstScalarSizeInBits) &&
-         "Unexpected packing types");
-
-  // Constant folding.
-  auto *Cst0 = dyn_cast<Constant>(Arg0);
-  auto *Cst1 = dyn_cast<Constant>(Arg1);
-  if (!Cst0 || !Cst1)
-    return nullptr;
-
-  SmallVector<Constant *, 32> Vals;
-  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-    for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
-      unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
-      auto *Cst = (Elt >= NumSrcEltsPerLane) ? Cst1 : Cst0;
-      auto *COp = Cst->getAggregateElement(SrcIdx);
-      if (COp && isa<UndefValue>(COp)) {
-        Vals.push_back(UndefValue::get(ResTy->getScalarType()));
-        continue;
-      }
-
-      auto *CInt = dyn_cast_or_null<ConstantInt>(COp);
-      if (!CInt)
-        return nullptr;
-
-      APInt Val = CInt->getValue();
-      assert(Val.getBitWidth() == ArgTy->getScalarSizeInBits() &&
-             "Unexpected constant bitwidth");
-
-      if (IsSigned) {
-        // PACKSS: Truncate signed value with signed saturation.
-        // Source values less than dst minint are saturated to minint.
-        // Source values greater than dst maxint are saturated to maxint.
-        if (Val.isSignedIntN(DstScalarSizeInBits))
-          Val = Val.trunc(DstScalarSizeInBits);
-        else if (Val.isNegative())
-          Val = APInt::getSignedMinValue(DstScalarSizeInBits);
-        else
-          Val = APInt::getSignedMaxValue(DstScalarSizeInBits);
-      } else {
-        // PACKUS: Truncate signed value with unsigned saturation.
-        // Source values less than zero are saturated to zero.
-        // Source values greater than dst maxuint are saturated to maxuint.
-        if (Val.isIntN(DstScalarSizeInBits))
-          Val = Val.trunc(DstScalarSizeInBits);
-        else if (Val.isNegative())
-          Val = APInt::getNullValue(DstScalarSizeInBits);
-        else
-          Val = APInt::getAllOnesValue(DstScalarSizeInBits);
-      }
-
-      Vals.push_back(ConstantInt::get(ResTy->getScalarType(), Val));
-    }
-  }
-
-  return ConstantVector::get(Vals);
-}
-
 static Value *simplifyX86movmsk(const IntrinsicInst &II) {
   Value *Arg = II.getArgOperand(0);
   Type *ResTy = II.getType();
@@ -2593,26 +2518,6 @@
       return replaceInstUsesWith(*II, V);
     break;
 
-  case Intrinsic::x86_sse2_packssdw_128:
-  case Intrinsic::x86_sse2_packsswb_128:
-  case Intrinsic::x86_avx2_packssdw:
-  case Intrinsic::x86_avx2_packsswb:
-  case Intrinsic::x86_avx512_packssdw_512:
-  case Intrinsic::x86_avx512_packsswb_512:
-    if (Value *V = simplifyX86pack(*II, true))
-      return replaceInstUsesWith(*II, V);
-    break;
-
-  case Intrinsic::x86_sse2_packuswb_128:
-  case Intrinsic::x86_sse41_packusdw:
-  case Intrinsic::x86_avx2_packusdw:
-  case Intrinsic::x86_avx2_packuswb:
-  case Intrinsic::x86_avx512_packusdw_512:
-  case Intrinsic::x86_avx512_packuswb_512:
-    if (Value *V = simplifyX86pack(*II, false))
-      return replaceInstUsesWith(*II, V);
-    break;
-
   case Intrinsic::x86_pclmulqdq: {
     if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
       unsigned Imm = C->getZExtValue();
Index: llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1436,63 +1436,6 @@
 
       break;
 
-    case Intrinsic::x86_sse2_packssdw_128:
-    case Intrinsic::x86_sse2_packsswb_128:
-    case Intrinsic::x86_sse2_packuswb_128:
-    case Intrinsic::x86_sse41_packusdw:
-    case Intrinsic::x86_avx2_packssdw:
-    case Intrinsic::x86_avx2_packsswb:
-    case Intrinsic::x86_avx2_packusdw:
-    case Intrinsic::x86_avx2_packuswb:
-    case Intrinsic::x86_avx512_packssdw_512:
-    case Intrinsic::x86_avx512_packsswb_512:
-    case Intrinsic::x86_avx512_packusdw_512:
-    case Intrinsic::x86_avx512_packuswb_512: {
-      auto *Ty0 = II->getArgOperand(0)->getType();
-      unsigned InnerVWidth = Ty0->getVectorNumElements();
-      assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
-
-      unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
-      unsigned VWidthPerLane = VWidth / NumLanes;
-      unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
-
-      // Per lane, pack the elements of the first input and then the second.
-      // e.g.
-      // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
-      // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
-      for (int OpNum = 0; OpNum != 2; ++OpNum) {
-        APInt OpDemandedElts(InnerVWidth, 0);
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          unsigned LaneIdx = Lane * VWidthPerLane;
-          for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
-            unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
-            if (DemandedElts[Idx])
-              OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
-          }
-        }
-
-        // Demand elements from the operand.
-        auto *Op = II->getArgOperand(OpNum);
-        APInt OpUndefElts(InnerVWidth, 0);
-        TmpV = SimplifyDemandedVectorElts(Op, OpDemandedElts, OpUndefElts,
-                                          Depth + 1);
-        if (TmpV) {
-          II->setArgOperand(OpNum, TmpV);
-          MadeChange = true;
-        }
-
-        // Pack the operand's UNDEF elements, one lane at a time.
-        OpUndefElts = OpUndefElts.zext(VWidth);
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
-          LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
-          LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
-          UndefElts |= LaneElts;
-        }
-      }
-      break;
-    }
-
     // PSHUFB
     case Intrinsic::x86_ssse3_pshuf_b_128:
     case Intrinsic::x86_avx2_pshuf_b:
Index: llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
===================================================================
--- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2374,22 +2374,6 @@
   // intrinsic.
   Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
     switch (id) {
-      case Intrinsic::x86_sse2_packsswb_128:
-      case Intrinsic::x86_sse2_packuswb_128:
-        return Intrinsic::x86_sse2_packsswb_128;
-
-      case Intrinsic::x86_sse2_packssdw_128:
-      case Intrinsic::x86_sse41_packusdw:
-        return Intrinsic::x86_sse2_packssdw_128;
-
-      case Intrinsic::x86_avx2_packsswb:
-      case Intrinsic::x86_avx2_packuswb:
-        return Intrinsic::x86_avx2_packsswb;
-
-      case Intrinsic::x86_avx2_packssdw:
-      case Intrinsic::x86_avx2_packusdw:
-        return Intrinsic::x86_avx2_packssdw;
-
       case Intrinsic::x86_mmx_packsswb:
       case Intrinsic::x86_mmx_packuswb:
         return Intrinsic::x86_mmx_packsswb;
@@ -2674,17 +2658,6 @@
       handleVectorShiftIntrinsic(I, /* Variable */ true);
       break;
 
-    case Intrinsic::x86_sse2_packsswb_128:
-    case Intrinsic::x86_sse2_packssdw_128:
-    case Intrinsic::x86_sse2_packuswb_128:
-    case Intrinsic::x86_sse41_packusdw:
-    case Intrinsic::x86_avx2_packsswb:
-    case Intrinsic::x86_avx2_packssdw:
-    case Intrinsic::x86_avx2_packuswb:
-    case Intrinsic::x86_avx2_packusdw:
-      handleVectorPackIntrinsic(I);
-      break;
-
     case Intrinsic::x86_mmx_packsswb:
     case Intrinsic::x86_mmx_packuswb:
       handleVectorPackIntrinsic(I, 16);
Index: llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
===================================================================
--- llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1936,11 +1936,15 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
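+  ; Expanded form of the removed llvm.x86.avx2.packsswb intrinsic: interleave
+  ; per 128-bit lane, clamp to the signed i8 range, then truncate.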
+  %1 = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %2 = icmp slt <32 x i16> %1, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %4 = icmp sgt <32 x i16> %3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %call = trunc <32 x i16> %5 to <32 x i8>
   %res = bitcast <32 x i8> %call to <4 x i64>
   ret <4 x i64> %res
 }
-declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_packs_epi32:
@@ -1949,11 +1953,15 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
-  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
+  %1 = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  %2 = icmp slt <16 x i32> %1, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %4 = icmp sgt <16 x i32> %3, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %call = trunc <16 x i32> %5 to <16 x i16>
   %res = bitcast <16 x i16> %call to <4 x i64>
   ret <4 x i64> %res
 }
-declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_packus_epi16:
@@ -1962,11 +1970,15 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
+  %1 = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %2 = icmp slt <32 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %4 = icmp sgt <32 x i16> %3, zeroinitializer
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
+  %call = trunc <32 x i16> %5 to <32 x i8>
   %res = bitcast <32 x i8> %call to <4 x i64>
   ret <4 x i64> %res
 }
-declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_packus_epi32:
@@ -1975,11 +1987,15 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
-  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
+  %1 = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  %2 = icmp slt <16 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %4 = icmp sgt <16 x i32> %3, zeroinitializer
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
+  %call = trunc <16 x i32> %5 to <16 x i16>
   %res = bitcast <16 x i16> %call to <4 x i64>
   ret <4 x i64> %res
 }
-declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_permute2x128_si256:
Index: llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
===================================================================
--- llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -4,6 +4,190 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX2
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=X64 --check-prefix=X64-AVX512
 
+define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
+; X86-LABEL: test_x86_avx2_packssdw:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packssdw:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_packssdw_unary(<8 x i32> %a) {
+; X86-LABEL: test_x86_avx2_packssdw_unary:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpackssdw %ymm0, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packssdw_unary:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpackssdw %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %a) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+
+
+define <16 x i16> @test_x86_avx2_packssdw_fold() {
+; X86-LABEL: test_x86_avx2_packssdw_fold:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packssdw_fold:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
+; X64-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
+  ret <16 x i16> %res
+}
+
+
+define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-LABEL: test_x86_avx2_packsswb:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packsswb:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_packsswb_unary(<16 x i16> %a) {
+; X86-LABEL: test_x86_avx2_packsswb_unary:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpacksswb %ymm0, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packsswb_unary:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpacksswb %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %a) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+
+
+define <32 x i8> @test_x86_avx2_packsswb_fold() {
+; X86-LABEL: test_x86_avx2_packsswb_fold:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packsswb_fold:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
+  ret <32 x i8> %res
+}
+
+
+define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-LABEL: test_x86_avx2_packuswb:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packuswb:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_packuswb_unary(<16 x i16> %a) {
+; X86-LABEL: test_x86_avx2_packuswb_unary:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpackuswb %ymm0, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packuswb_unary:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpackuswb %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %a) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+
+
+define <32 x i8> @test_x86_avx2_packuswb_fold() {
+; X86-LABEL: test_x86_avx2_packuswb_fold:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packuswb_fold:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
+  ret <32 x i8> %res
+}
+
+
+define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
+; X86-LABEL: test_x86_avx2_packusdw:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packusdw:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_packusdw_unary(<8 x i32> %a) {
+; X86-LABEL: test_x86_avx2_packusdw_unary:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpackusdw %ymm0, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packusdw_unary:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpackusdw %ymm0, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %a) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+
+
+define <16 x i16> @test_x86_avx2_packusdw_fold() {
+; X86-LABEL: test_x86_avx2_packusdw_fold:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx2_packusdw_fold:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; X64-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
+  ret <16 x i16> %res
+}
+
+
 define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
 ; X86-LABEL: test_x86_avx2_pblendw:
 ; X86:       ## %bb.0:
Index: llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
===================================================================
--- llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -4,183 +4,6 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL
 
-define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packssdw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packssdw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packssdw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packssdw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_packssdw_fold() {
-; X86-AVX-LABEL: test_x86_avx2_packssdw_fold:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
-; X86-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI1_0, kind: FK_Data_4
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packssdw_fold:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovaps LCPI1_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
-; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI1_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packssdw_fold:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
-; X64-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI1_0-4, kind: reloc_riprel_4byte
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packssdw_fold:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
-; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI1_0-4, kind: reloc_riprel_4byte
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
-  ret <16 x i16> %res
-}
-
-
-define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packsswb:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packsswb:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packsswb:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packsswb:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
-
-
-define <32 x i8> @test_x86_avx2_packsswb_fold() {
-; X86-AVX-LABEL: test_x86_avx2_packsswb_fold:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI3_0, kind: FK_Data_4
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packsswb_fold:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovaps LCPI3_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI3_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packsswb_fold:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packsswb_fold:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
-  ret <32 x i8> %res
-}
-
-
-define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packuswb:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packuswb:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packuswb:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packuswb:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
-
-
-define <32 x i8> @test_x86_avx2_packuswb_fold() {
-; X86-AVX-LABEL: test_x86_avx2_packuswb_fold:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packuswb_fold:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovaps LCPI5_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packuswb_fold:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packuswb_fold:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
-  ret <32 x i8> %res
-}
-
-
 define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
 ; X86-AVX-LABEL: test_x86_avx2_padds_b:
 ; X86-AVX:       ## %bb.0:
@@ -1299,65 +1122,6 @@
 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
 
 
-define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packusdw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packusdw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packusdw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packusdw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_packusdw_fold() {
-; X86-AVX-LABEL: test_x86_avx2_packusdw_fold:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
-; X86-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovaps LCPI54_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
-; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packusdw_fold:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
-; X64-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
-; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
-  ret <16 x i16> %res
-}
-
-
 define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) {
 ; X86-LABEL: test_x86_avx2_pblendvb:
 ; X86:       ## %bb.0:
@@ -2071,36 +1835,36 @@
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI86_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI78_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI78_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI78_1, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI86_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI78_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI86_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI78_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI78_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI78_1, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI78_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI78_1-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI78_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI78_1-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
   ret <4 x i32> %res
@@ -2136,36 +1900,36 @@
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI88_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI80_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI80_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI80_1, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI88_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI80_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI88_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI80_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI80_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI80_1, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI80_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI80_1-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI80_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI80_1-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
   ret <8 x i32> %res
Index: llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -1137,6 +1137,640 @@
   %res = call <32 x i16> @llvm.x86.avx512.cvtmask2w.512(i32 %x0)
   ret <32 x i16> %res
 }
+
+
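+; Tests for llvm.x86.avx512.packssdw.512 (signed-saturation dword-to-word
+; pack): register (rr), masked (rrk), zero-masked (rrkz), memory (rm*) and
+; broadcast (rmb*) operand forms. Masking is expressed as a bitcast of the
+; scalar mask to <32 x i1> followed by a select.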
+define <32 x i16> @test_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
+; AVX512BW-LABEL: test_packs_epi32_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packs_epi32_rr_512_unary(<16 x i32> %a) {
+; AVX512BW-LABEL: test_packs_epi32_rr_512_unary:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackssdw %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rr_512_unary:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpackssdw %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %a)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
+; AVX512BW-LABEL: test_packs_epi32_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
+; AVX512BW-LABEL: test_packs_epi32_rmb_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi32_rmb_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>)
+
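+; The same operand-form matrix for llvm.x86.avx512.packsswb.512, which packs
+; <32 x i16> inputs to <64 x i8> under an i64 mask.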
+define <64 x i8> @test_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_packs_epi16_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @test_packs_epi16_rr_512_unary(<32 x i16> %a) {
+; AVX512BW-LABEL: test_packs_epi16_rr_512_unary:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpacksswb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rr_512_unary:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpacksswb %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %a)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
+; AVX512BW-LABEL: test_packs_epi16_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_packs_epi16_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @test_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_packs_epi16_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
+; AVX512BW-LABEL: test_packs_epi16_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packs_epi16_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+  ret <64 x i8> %3
+}
+
+declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>)
+
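+; Unsigned-saturation dword-to-word pack: llvm.x86.avx512.packusdw.512.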
+define <32 x i16> @test_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
+; AVX512BW-LABEL: test_packus_epi32_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packus_epi32_rr_512_unary(<16 x i32> %a) {
+; AVX512BW-LABEL: test_packus_epi32_rr_512_unary:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackusdw %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rr_512_unary:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpackusdw %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %a)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
+; AVX512BW-LABEL: test_packus_epi32_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
+; AVX512BW-LABEL: test_packus_epi32_rmb_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rmb_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rmbk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rmbk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_packus_epi32_rmbkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi32_rmbkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>)
+
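+; Unsigned-saturation word-to-byte pack: llvm.x86.avx512.packuswb.512.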
+define <64 x i8> @test_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_packus_epi16_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @test_packus_epi16_rr_512_unary(<32 x i16> %a) {
+; AVX512BW-LABEL: test_packus_epi16_rr_512_unary:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackuswb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rr_512_unary:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpackuswb %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %a)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @test_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_packus_epi16_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
+; AVX512BW-LABEL: test_packus_epi16_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_packus_epi16_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @test_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_packus_epi16_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @test_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
+; AVX512BW-LABEL: test_packus_epi16_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_packus_epi16_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+  ret <64 x i8> %3
+}
+
+declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>)
+
 define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
 ; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
 ; AVX512BW:       ## %bb.0:
@@ -1151,6 +1785,20 @@
   ret <32 x i16> %res
 }
 
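+; Unary variant: the masked signed dword-to-word pack called with %a as both
+; inputs.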
+define <32 x i16> @test_mask_packs_epi32_rr_512_unary(<16 x i32> %a) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rr_512_unary:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackssdw %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rr_512_unary:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpackssdw %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %a, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
 define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
 ; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
 ; AVX512BW:       ## %bb.0:
@@ -1257,15 +1905,15 @@
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
+define <32 x i16> @test_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rmbk_512:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
+; AVX512F-32-LABEL: test_packs_epi32_rmbk_512:
 ; AVX512F-32:       # %bb.0:
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
@@ -1279,14 +1927,14 @@
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
+define <32 x i16> @test_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_packs_epi32_rmbkz_512:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    retq
 ;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512:
+; AVX512F-32-LABEL: test_packs_epi32_rmbkz_512:
 ; AVX512F-32:       # %bb.0:
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
@@ -1315,15 +1963,29 @@
   ret <64 x i8> %res
 }
 
-define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
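+; Unary variant of the masked signed word-to-byte pack.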
+define <64 x i8> @test_mask_packs_epi16_rr_512_unary(<32 x i16> %a) {
+; AVX512BW-LABEL: test_mask_packs_epi16_rr_512_unary:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpacksswb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi16_rr_512_unary:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpacksswb %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %a, <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_packs_epi16_rrk_512:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    kmovq %rdi, %k1
 ; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
+; AVX512F-32-LABEL: test_packs_epi16_rrk_512:
 ; AVX512F-32:       # %bb.0:
 ; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
@@ -1420,6 +2082,20 @@
   ret <32 x i16> %res
 }
 
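+; Unary variant of the masked unsigned dword-to-word pack.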
+define <32 x i16> @test_mask_packus_epi32_rr_512_unary(<16 x i32> %a) {
+; AVX512BW-LABEL: test_mask_packus_epi32_rr_512_unary:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackusdw %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi32_rr_512_unary:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpackusdw %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %a, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
 define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
 ; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512:
 ; AVX512BW:       ## %bb.0:
@@ -1584,6 +2260,20 @@
   ret <64 x i8> %res
 }
 
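+; Unary variant of the masked unsigned word-to-byte pack.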
+define <64 x i8> @test_mask_packus_epi16_rr_512_unary(<32 x i16> %a) {
+; AVX512BW-LABEL: test_mask_packus_epi16_rr_512_unary:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpackuswb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi16_rr_512_unary:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpackuswb %zmm0, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %a, <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %res
+}
+
 define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
 ; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512:
 ; AVX512BW:       ## %bb.0:
Index: llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2,583 +2,6 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
 
-define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmb_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmb_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
-  ret <32 x i16> %3
-}
-
-declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>)
-
-define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
-  ret <64 x i8> %1
-}
-
-define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
-  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i64 %mask to <64 x i1>
-  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
-  ret <64 x i8> %3
-}
-
-define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rrkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i64 %mask to <64 x i1>
-  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
-  ret <64 x i8> %3
-}
-
-define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rm_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
-  ret <64 x i8> %1
-}
-
-define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i64 %mask to <64 x i1>
-  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
-  ret <64 x i8> %3
-}
-
-define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rmkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i64 %mask to <64 x i1>
-  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
-  ret <64 x i8> %3
-}
-
-declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>)
-
-
-define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rrk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rrkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rm_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rmk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rmkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmb_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rmb_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmbk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rmbk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmbkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rmbkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
-  ret <32 x i16> %3
-}
-
-declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>)
-
-define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
-  ret <64 x i8> %1
-}
-
-define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
-  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i64 %mask to <64 x i1>
-  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
-  ret <64 x i8> %3
-}
-
-define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rrkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i64 %mask to <64 x i1>
-  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
-  ret <64 x i8> %3
-}
-
-define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rm_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
-  ret <64 x i8> %1
-}
-
-define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i64 %mask to <64 x i1>
-  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
-  ret <64 x i8> %3
-}
-
-define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rmkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i64 %mask to <64 x i1>
-  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
-  ret <64 x i8> %3
-}
-
-declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>)
-
 define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
 ; AVX512BW:       ## %bb.0:
Index: llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -2109,6 +2109,836 @@
   %res = call <16 x i16> @llvm.x86.avx512.cvtmask2w.256(i16 %x0)
   ret <16 x i16> %res
 }
+
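+; The tests below cover the legacy (unmasked) pack intrinsics. Test names encode
+; the operand forms: rr = reg/reg, rm = reg/mem, rmb = reg/broadcast-load; a
+; trailing k merges the result into a passthru vector via bitcast+select, kz
+; merges against zero, and _unary passes the same operand for both sources.
+;
+; Signed dword-to-word saturation (vpackssdw), 128-bit and 256-bit.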
+define <8 x i16> @test_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_packs_epi32_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_packs_epi32_rr_128_unary(<4 x i32> %a) {
+; CHECK-LABEL: test_packs_epi32_rr_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %a)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_packs_epi32_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_packs_epi32_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_packs_epi32_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_packs_epi32_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_packs_epi32_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_packs_epi32_rmb_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_packs_epi32_rmbk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_packs_epi32_rmbkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
+}
+
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
+
+define <16 x i16> @test_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_packs_epi32_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @test_packs_epi32_rr_256_unary(<8 x i32> %a) {
+; CHECK-LABEL: test_packs_epi32_rr_256_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackssdw %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %a)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @test_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_packs_epi32_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_packs_epi32_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_packs_epi32_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @test_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_packs_epi32_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_packs_epi32_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_packs_epi32_rmb_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @test_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_packs_epi32_rmbk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_packs_epi32_rmbkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
+}
+
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
+
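+; Signed word-to-byte saturation (vpacksswb), 128-bit and 256-bit.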
+define <16 x i8> @test_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_packs_epi16_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @test_packs_epi16_rr_128_unary(<8 x i16> %a) {
+; CHECK-LABEL: test_packs_epi16_rr_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %a)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @test_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_packs_epi16_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @test_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_packs_epi16_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @test_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_packs_epi16_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @test_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_packs_epi16_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @test_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_packs_epi16_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
+  ret <16 x i8> %3
+}
+
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
+
+define <32 x i8> @test_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_packs_epi16_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @test_packs_epi16_rr_256_unary(<16 x i16> %a) {
+; CHECK-LABEL: test_packs_epi16_rr_256_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpacksswb %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %a)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @test_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_packs_epi16_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @test_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
+; CHECK-LABEL: test_packs_epi16_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @test_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_packs_epi16_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @test_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_packs_epi16_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @test_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_packs_epi16_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
+  ret <32 x i8> %3
+}
+
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
+
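+; Unsigned dword-to-word saturation (vpackusdw), 128-bit and 256-bit.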
+define <8 x i16> @test_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_packus_epi32_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_packus_epi32_rr_128_unary(<4 x i32> %a) {
+; CHECK-LABEL: test_packus_epi32_rr_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %a)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_packus_epi32_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_packus_epi32_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_packus_epi32_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_packus_epi32_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_packus_epi32_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_packus_epi32_rmb_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_packus_epi32_rmbk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_packus_epi32_rmbkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
+}
+
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)
+
+define <16 x i16> @test_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_packus_epi32_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @test_packus_epi32_rr_256_unary(<8 x i32> %a) {
+; CHECK-LABEL: test_packus_epi32_rr_256_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackusdw %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %a)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @test_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_packus_epi32_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_packus_epi32_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_packus_epi32_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @test_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_packus_epi32_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_packus_epi32_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_packus_epi32_rmb_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @test_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_packus_epi32_rmbk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_packus_epi32_rmbkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
+}
+
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)
+
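+; Unsigned word-to-byte saturation (vpackuswb), 128-bit and 256-bit.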
+define <16 x i8> @test_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_packus_epi16_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @test_packus_epi16_rr_128_unary(<8 x i16> %a) {
+; CHECK-LABEL: test_packus_epi16_rr_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %a)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @test_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_packus_epi16_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @test_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_packus_epi16_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @test_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_packus_epi16_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @test_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_packus_epi16_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @test_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_packus_epi16_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
+  ret <16 x i8> %3
+}
+
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
+
+define <32 x i8> @test_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_packus_epi16_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @test_packus_epi16_rr_256_unary(<16 x i16> %a) {
+; CHECK-LABEL: test_packus_epi16_rr_256_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackuswb %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %a)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @test_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_packus_epi16_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @test_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
+; CHECK-LABEL: test_packus_epi16_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @test_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_packus_epi16_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @test_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_packus_epi16_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @test_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_packus_epi16_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
+  ret <32 x i8> %3
+}
+
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
+
 define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test_mask_packs_epi32_rr_128:
 ; CHECK:       ## %bb.0:
@@ -2118,6 +2948,15 @@
   ret <8 x i16> %res
 }
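+; Unary variant of the masked pack: the same source is passed for both operands.
+; The same pattern repeats for each pack variant below.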
 
+define <8 x i16> @test_mask_packs_epi32_rr_128_unary(<4 x i32> %a) {
+; CHECK-LABEL: test_mask_packs_epi32_rr_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %a, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
 define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rrk_128:
 ; CHECK:       ## %bb.0:
@@ -2222,6 +3061,15 @@
   ret <16 x i16> %res
 }
 
+define <16 x i16> @test_mask_packs_epi32_rr_256_unary(<8 x i32> %a) {
+; CHECK-LABEL: test_mask_packs_epi32_rr_256_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackssdw %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %a, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
 define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rrk_256:
 ; CHECK:       ## %bb.0:
@@ -2326,6 +3174,15 @@
   ret <16 x i8> %res
 }
 
+define <16 x i8> @test_mask_packs_epi16_rr_128_unary(<8 x i16> %a) {
+; CHECK-LABEL: test_mask_packs_epi16_rr_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %a, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
 define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi16_rrk_128:
 ; CHECK:       ## %bb.0:
@@ -2391,6 +3248,15 @@
   ret <32 x i8> %res
 }
 
+define <32 x i8> @test_mask_packs_epi16_rr_256_unary(<16 x i16> %a) {
+; CHECK-LABEL: test_mask_packs_epi16_rr_256_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpacksswb %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %a, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
 define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi16_rrk_256:
 ; CHECK:       ## %bb.0:
@@ -2457,6 +3323,15 @@
   ret <8 x i16> %res
 }
 
+define <8 x i16> @test_mask_packus_epi32_rr_128_unary(<4 x i32> %a) {
+; CHECK-LABEL: test_mask_packus_epi32_rr_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %a, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
 define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rrk_128:
 ; CHECK:       ## %bb.0:
@@ -2561,6 +3436,15 @@
   ret <16 x i16> %res
 }
 
+define <16 x i16> @test_mask_packus_epi32_rr_256_unary(<8 x i32> %a) {
+; CHECK-LABEL: test_mask_packus_epi32_rr_256_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackusdw %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %a, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
 define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rrk_256:
 ; CHECK:       ## %bb.0:
@@ -2665,6 +3549,15 @@
   ret <16 x i8> %res
 }
 
+define <16 x i8> @test_mask_packus_epi16_rr_128_unary(<8 x i16> %a) {
+; CHECK-LABEL: test_mask_packus_epi16_rr_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %a, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
 define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi16_rrk_128:
 ; CHECK:       ## %bb.0:
@@ -2730,6 +3623,15 @@
   ret <32 x i8> %res
 }
 
+define <32 x i8> @test_mask_packus_epi16_rr_256_unary(<16 x i16> %a) {
+; CHECK-LABEL: test_mask_packus_epi16_rr_256_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpackuswb %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %a, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
 define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi16_rrk_256:
 ; CHECK:       ## %bb.0:
Index: llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1,763 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
 
-define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test_mask_packs_epi32_rr_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
-  ret <8 x i16> %1
-}
-
-define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rrk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
-  ret <8 x i16> %3
-}
-
-define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rrkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
-  ret <8 x i16> %3
-}
-
-define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mask_packs_epi32_rm_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
-  ret <8 x i16> %1
-}
-
-define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rmk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
-  ret <8 x i16> %3
-}
-
-define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rmkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
-  ret <8 x i16> %3
-}
-
-define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
-; CHECK-LABEL: test_mask_packs_epi32_rmb_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
-  ret <8 x i16> %1
-}
-
-define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rmbk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
-  ret <8 x i16> %3
-}
-
-define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rmbkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
-  ret <8 x i16> %3
-}
-
-declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
-
-define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: test_mask_packs_epi32_rr_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rrk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rrkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mask_packs_epi32_rm_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rmk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rmkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
-; CHECK-LABEL: test_mask_packs_epi32_rmb_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rmbk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_packs_epi32_rmbkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
-  ret <16 x i16> %3
-}
-
-declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
-
-define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_mask_packs_epi16_rr_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
-  ret <16 x i8> %1
-}
-
-define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packs_epi16_rrk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
-  ret <16 x i8> %3
-}
-
-define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_packs_epi16_rrkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
-  ret <16 x i8> %3
-}
-
-define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_packs_epi16_rm_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
-  ret <16 x i8> %1
-}
-
-define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packs_epi16_rmk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
-  ret <16 x i8> %3
-}
-
-define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_packs_epi16_rmkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
-  ret <16 x i8> %3
-}
-
-declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
-
-define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_mask_packs_epi16_rr_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
-  ret <32 x i8> %1
-}
-
-define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_packs_epi16_rrk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1]
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
-  ret <32 x i8> %3
-}
-
-define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_packs_epi16_rrkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
-  ret <32 x i8> %3
-}
-
-define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_packs_epi16_rm_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
-  ret <32 x i8> %1
-}
-
-define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_packs_epi16_rmk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
-  ret <32 x i8> %3
-}
-
-define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
-; CHECK-LABEL: test_mask_packs_epi16_rmkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
-  ret <32 x i8> %3
-}
-
-declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
-
-
-define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test_mask_packus_epi32_rr_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
-  ret <8 x i16> %1
-}
-
-define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rrk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
-  ret <8 x i16> %3
-}
-
-define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rrkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
-  ret <8 x i16> %3
-}
-
-define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mask_packus_epi32_rm_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
-  ret <8 x i16> %1
-}
-
-define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
-  ret <8 x i16> %3
-}
-
-define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
-  ret <8 x i16> %3
-}
-
-define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
-; CHECK-LABEL: test_mask_packus_epi32_rmb_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
-  ret <8 x i16> %1
-}
-
-define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmbk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
-  ret <8 x i16> %3
-}
-
-define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmbkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
-  %2 = bitcast i8 %mask to <8 x i1>
-  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
-  ret <8 x i16> %3
-}
-
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)
-
-define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: test_mask_packus_epi32_rr_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rrk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rrkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mask_packus_epi32_rm_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
-; CHECK-LABEL: test_mask_packus_epi32_rmb_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmbk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
-  ret <16 x i16> %3
-}
-
-define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi32_rmbkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load i32, i32* %ptr_b
-  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
-  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
-  ret <16 x i16> %3
-}
-
-declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)
-
-define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_mask_packus_epi16_rr_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  ret <16 x i8> %1
-}
-
-define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rrk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
-  ret <16 x i8> %3
-}
-
-define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rrkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
-  ret <16 x i8> %3
-}
-
-define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_packus_epi16_rm_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  ret <16 x i8> %1
-}
-
-define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rmk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
-  ret <16 x i8> %3
-}
-
-define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rmkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
-  %2 = bitcast i16 %mask to <16 x i1>
-  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
-  ret <16 x i8> %3
-}
-
-declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
-
-define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_mask_packus_epi16_rr_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  ret <32 x i8> %1
-}
-
-define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rrk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
-  ret <32 x i8> %3
-}
-
-define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rrkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
-  ret <32 x i8> %3
-}
-
-define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_packus_epi16_rm_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  ret <32 x i8> %1
-}
-
-define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rmk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
-  ret <32 x i8> %3
-}
-
-define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
-; CHECK-LABEL: test_mask_packus_epi16_rmkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
-  ret <32 x i8> %3
-}
-
-declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
-
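
The deletions above drop the full matrix of masked pack tests, one per operand form: `rr` register/register, `rm` register/memory, `rmb` register/broadcast-memory, with a trailing `k` for merge-masking into a passthru vector and `kz` for zero-masking. Every masked variant expresses the AVX-512 mask the same way: bitcast the scalar mask to a vector of i1, then select over the pack result. A minimal sketch of the merge form (function name illustrative, not from the patch):

define <8 x i16> @mask_merge_sketch(<8 x i16> %packed, <8 x i16> %passThru, i8 %mask) {
; Model AVX-512 merge-masking as a plain vector select; the kz form is
; identical except it selects zeroinitializer instead of %passThru.
  %m = bitcast i8 %mask to <8 x i1>
  %r = select <8 x i1> %m, <8 x i16> %packed, <8 x i16> %passThru
  ret <8 x i16> %r
}
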
 define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test_mask_adds_epi16_rr_128:
 ; CHECK:       ## %bb.0:
Index: llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
===================================================================
--- llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2002,11 +2002,15 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
-  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
+  %1 = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = icmp slt <16 x i16> %1, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %4 = icmp sgt <16 x i16> %3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %res = trunc <16 x i16> %5 to <16 x i8>
   %bc = bitcast <16 x i8> %res to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_packs_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm_packs_epi32:
@@ -2020,11 +2024,15 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
-  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %arg0, <4 x i32> %arg1)
+  %1 = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = icmp slt <8 x i32> %1, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %4 = icmp sgt <8 x i32> %3, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %res = trunc <8 x i32> %5 to <8 x i16>
   %bc = bitcast <8 x i16> %res to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_packus_epi16(<2 x i64> %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm_packus_epi16:
@@ -2038,11 +2046,15 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
-  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
+  %1 = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = icmp slt <16 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %4 = icmp sgt <16 x i16> %3, zeroinitializer
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer
+  %res = trunc <16 x i16> %5 to <16 x i8>
   %bc = bitcast <16 x i8> %res to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
 
 define void @test_mm_pause() nounwind {
 ; X32-LABEL: test_mm_pause:
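
Note that all three rewrites above clamp with signed comparisons (`icmp slt`/`icmp sgt`), including the unsigned packs: the x86 pack instructions read their inputs as signed values and saturate them into the destination range, so a negative input lane becomes 0 under PACKUSWB. A minimal standalone sketch of that unsigned clamp on a single vector (function name illustrative, not part of the patch):

define <8 x i8> @packus_clamp_sketch(<8 x i16> %x) {
; Clamp the signed i16 lanes into [0, 255], then truncate; this is the
; same shape the upgraded test bodies above use, minus the concat shuffle.
  %lt = icmp slt <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %hi = select <8 x i1> %lt, <8 x i16> %x, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %gt = icmp sgt <8 x i16> %hi, zeroinitializer
  %lo = select <8 x i1> %gt, <8 x i16> %hi, <8 x i16> zeroinitializer
  %r = trunc <8 x i16> %lo to <8 x i8>
  ret <8 x i8> %r
}
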
Index: llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
===================================================================
--- llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -226,6 +226,100 @@
 }
 declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
 
+
+define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_x86_sse2_packssdw_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packssdw %xmm1, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_packssdw_128_unary(<4 x i32> %a) {
+; CHECK-LABEL: test_x86_sse2_packssdw_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packssdw %xmm0, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %a) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+
+
+define <8 x i16> @test_x86_sse2_packssdw_128_fold() {
+; CHECK-LABEL: test_x86_sse2_packssdw_128_fold:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768]
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
+  ret <8 x i16> %res
+}
+
+
+define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_x86_sse2_packsswb_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packsswb %xmm1, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_packsswb_128_unary(<8 x i16> %a) {
+; CHECK-LABEL: test_x86_sse2_packsswb_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packsswb %xmm0, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %a) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+
+define <16 x i8> @test_x86_sse2_packsswb_128_fold() {
+; CHECK-LABEL: test_x86_sse2_packsswb_128_fold:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
+  ret <16 x i8> %res
+}
+
+
+define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_x86_sse2_packuswb_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packuswb %xmm1, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_packuswb_128_unary(<8 x i16> %a) {
+; CHECK-LABEL: test_x86_sse2_packuswb_128_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packuswb %xmm0, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %a) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+
+define <16 x i8> @test_x86_sse2_packuswb_128_fold() {
+; CHECK-LABEL: test_x86_sse2_packuswb_128_fold:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; CHECK-NEXT:    retl
+  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
+  ret <16 x i8> %res
+}
+
+
 define <16 x i8> @mm_avg_epu8(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: mm_avg_epu8:
 ; CHECK:       ## %bb.0:
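
The expected vectors in the `_fold` tests above follow directly from per-lane saturation, with the packed result printed as unsigned (reference formulas, not code from the patch):

  ss8(x)  = min(max(x, -128),   127)     ; packsswb
  us8(x)  = min(max(x, 0),      255)     ; packuswb
  ss16(x) = min(max(x, -32768), 32767)   ; packssdw
  us16(x) = min(max(x, 0),      65535)   ; packusdw

For example, in `test_x86_sse2_packsswb_128_fold` the i16 inputs 255 and 256 saturate to 127, 65535 wraps to -1 (printed 255), and -255, -256, and -32678 all saturate to -128 (printed 128), yielding the `[0,127,127,255,255,128,128,128,...]` constant.
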
Index: llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
===================================================================
--- llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -741,147 +741,6 @@
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
 
 
-define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
-; SSE-LABEL: test_x86_sse2_packssdw_128:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    packssdw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x6b,0xc1]
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packssdw_128:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packssdw_128:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_packssdw_128_fold() {
-; SSE-LABEL: test_x86_sse2_packssdw_128_fold:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768]
-; SSE-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; SSE-NEXT:    ## fixup A - offset: 3, value: LCPI35_0, kind: FK_Data_4
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packssdw_128_fold:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI35_0, kind: FK_Data_4
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packssdw_128_fold:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vmovaps LCPI35_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768]
-; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI35_0, kind: FK_Data_4
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
-  ret <8 x i16> %res
-}
-
-
-define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_packsswb_128:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    packsswb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x63,0xc1]
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packsswb_128:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packsswb_128:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_packsswb_128_fold() {
-; SSE-LABEL: test_x86_sse2_packsswb_128_fold:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; SSE-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; SSE-NEXT:    ## fixup A - offset: 3, value: LCPI37_0, kind: FK_Data_4
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packsswb_128_fold:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI37_0, kind: FK_Data_4
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packsswb_128_fold:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vmovaps LCPI37_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI37_0, kind: FK_Data_4
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
-  ret <16 x i8> %res
-}
-
-
-define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_packuswb_128:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    packuswb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x67,0xc1]
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packuswb_128:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x67,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packuswb_128:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_packuswb_128_fold() {
-; SSE-LABEL: test_x86_sse2_packuswb_128_fold:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; SSE-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; SSE-NEXT:    ## fixup A - offset: 3, value: LCPI39_0, kind: FK_Data_4
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_packuswb_128_fold:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI39_0, kind: FK_Data_4
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_packuswb_128_fold:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vmovaps LCPI39_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI39_0, kind: FK_Data_4
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
-  ret <16 x i8> %res
-}
-
-
 define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
 ; SSE-LABEL: test_x86_sse2_padds_b:
 ; SSE:       ## %bb.0:
Index: llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
===================================================================
--- llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -847,11 +847,15 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
-  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %arg0, <4 x i32> %arg1)
+  %1 = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = icmp slt <8 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %4 = icmp sgt <8 x i32> %3, zeroinitializer
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
+  %res = trunc <8 x i32> %5 to <8 x i16>
   %bc = bitcast <8 x i16> %res to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x double> @test_mm_round_pd(<2 x double> %a0) {
 ; X32-LABEL: test_mm_round_pd:
Index: llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
===================================================================
--- llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -83,6 +83,37 @@
 declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
 
 
+define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_x86_sse41_packusdw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packusdw %xmm1, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_packusdw_unary(<4 x i32> %a) {
+; CHECK-LABEL: test_x86_sse41_packusdw_unary:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    packusdw %xmm0, %xmm0
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %a) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+
+
+define <8 x i16> @test_x86_sse41_packusdw_fold() {
+; CHECK-LABEL: test_x86_sse41_packusdw_fold:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
+; CHECK-NEXT:    retl
+  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
+  ret <8 x i16> %res
+}
+
+
 define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: test_x86_sse41_pblendw:
 ; CHECK:       ## %bb.0:
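
The `_unary` variants above feed the same register to both operands, matching the common `_mm_packus_epi32(v, v)` idiom. Under the concat-clamp-truncate expansion shown earlier for `test_mm_packus_epi32`, that case simply shuffles `%a` with itself; a hedged sketch of the expected generic form (function name illustrative):

define <8 x i16> @packusdw_unary_sketch(<4 x i32> %a) {
; Concatenate %a with itself, clamp the signed i32 lanes into [0, 65535],
; then truncate to i16 -- mirroring the two-operand expansion above.
  %cat = shufflevector <4 x i32> %a, <4 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %lt = icmp slt <8 x i32> %cat, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %hi = select <8 x i1> %lt, <8 x i32> %cat, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %gt = icmp sgt <8 x i32> %hi, zeroinitializer
  %lo = select <8 x i1> %gt, <8 x i32> %hi, <8 x i32> zeroinitializer
  %res = trunc <8 x i32> %lo to <8 x i16>
  ret <8 x i16> %res
}
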
Index: llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
===================================================================
--- llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -114,53 +114,6 @@
 declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
 
-define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
-; SSE41-LABEL: test_x86_sse41_packusdw:
-; SSE41:       ## %bb.0:
-; SSE41-NEXT:    packusdw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x2b,0xc1]
-; SSE41-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse41_packusdw:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse41_packusdw:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse41_packusdw_fold() {
-; SSE41-LABEL: test_x86_sse41_packusdw_fold:
-; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
-; SSE41-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; SSE41-NEXT:    ## fixup A - offset: 3, value: LCPI7_0, kind: FK_Data_4
-; SSE41-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse41_packusdw_fold:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse41_packusdw_fold:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vmovaps LCPI7_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0]
-; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
-  ret <8 x i16> %res
-}
-
-
 define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
 ; SSE41-LABEL: test_x86_sse41_pblendvb:
 ; SSE41:       ## %bb.0:
Index: llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
===================================================================
--- llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
+++ llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
@@ -4,42 +4,8 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
-declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) nounwind readnone
 declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
 
-define <8 x i16> @Test_packssdw_128(<4 x i32> %a, <4 x i32> %b) sanitize_memory {
-entry:
-  %c = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) nounwind
-  ret <8 x i16> %c
-}
-
-; CHECK-LABEL: @Test_packssdw_128(
-; CHECK-DAG: icmp ne <4 x i32> {{.*}}, zeroinitializer
-; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i32>
-; CHECK-DAG: icmp ne <4 x i32> {{.*}}, zeroinitializer
-; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i32>
-; CHECK-DAG: call <8 x i16> @llvm.x86.sse2.packssdw.128(
-; CHECK-DAG: call <8 x i16> @llvm.x86.sse2.packssdw.128(
-; CHECK: ret <8 x i16>
-
-
-define <32 x i8> @Test_avx_packuswb(<16 x i16> %a, <16 x i16> %b) sanitize_memory {
-entry:
-  %c = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) nounwind
-  ret <32 x i8> %c
-}
-
-; CHECK-LABEL: @Test_avx_packuswb(
-; CHECK-DAG: icmp ne <16 x i16> {{.*}}, zeroinitializer
-; CHECK-DAG: sext <16 x i1> {{.*}} to <16 x i16>
-; CHECK-DAG: icmp ne <16 x i16> {{.*}}, zeroinitializer
-; CHECK-DAG: sext <16 x i1> {{.*}} to <16 x i16>
-; CHECK-DAG: call <32 x i8> @llvm.x86.avx2.packsswb(
-; CHECK-DAG: call <32 x i8> @llvm.x86.avx2.packuswb(
-; CHECK: ret <32 x i8>
-
-
 define x86_mmx @Test_mmx_packuswb(x86_mmx %a, x86_mmx %b) sanitize_memory {
 entry:
   %c = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind
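
The two deleted functions covered MSan's custom handling of the SSE2/AVX2 pack intrinsics, whose shape the removed CHECK-DAG lines describe: each operand's shadow is collapsed lane-wise (`icmp ne` against zero, then `sext`) so any poisoned bit poisons its whole lane, and the lane shadows are then re-packed with a signed-saturating pack so they line up with the result lanes. Once the intrinsics are auto-upgraded to plain shufflevector/icmp/select/trunc before instrumentation runs, the stock propagation rules for those instructions cover them, leaving only the MMX builtin on the custom path. A sketch of just the lane-collapse step the CHECK-DAG lines matched (function name illustrative):

define <4 x i32> @shadow_lane_collapse(<4 x i32> %shadow) {
; Any nonzero (poisoned) bit in a lane makes the whole lane all-ones.
  %poisoned = icmp ne <4 x i32> %shadow, zeroinitializer
  %lanes = sext <4 x i1> %poisoned to <4 x i32>
  ret <4 x i32> %lanes
}
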
Index: llvm/test/Transforms/InstCombine/X86/x86-pack.ll
===================================================================
--- llvm/test/Transforms/InstCombine/X86/x86-pack.ll
+++ /dev/null
@@ -1,366 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-;
-; UNDEF Elts
-;
-
-define <8 x i16> @undef_packssdw_128() {
-; CHECK-LABEL: @undef_packssdw_128(
-; CHECK-NEXT:    ret <8 x i16> undef
-;
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> undef, <4 x i32> undef)
-  ret <8 x i16> %1
-}
-
-define <8 x i16> @undef_packusdw_128() {
-; CHECK-LABEL: @undef_packusdw_128(
-; CHECK-NEXT:    ret <8 x i16> undef
-;
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> undef)
-  ret <8 x i16> %1
-}
-
-define <16 x i8> @undef_packsswb_128() {
-; CHECK-LABEL: @undef_packsswb_128(
-; CHECK-NEXT:    ret <16 x i8> undef
-;
-  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> undef, <8 x i16> undef)
-  ret <16 x i8> %1
-}
-
-define <16 x i8> @undef_packuswb_128() {
-; CHECK-LABEL: @undef_packuswb_128(
-; CHECK-NEXT:    ret <16 x i8> undef
-;
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> undef, <8 x i16> undef)
-  ret <16 x i8> %1
-}
-
-define <16 x i16> @undef_packssdw_256() {
-; CHECK-LABEL: @undef_packssdw_256(
-; CHECK-NEXT:    ret <16 x i16> undef
-;
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> undef)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @undef_packusdw_256() {
-; CHECK-LABEL: @undef_packusdw_256(
-; CHECK-NEXT:    ret <16 x i16> undef
-;
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> undef)
-  ret <16 x i16> %1
-}
-
-define <32 x i8> @undef_packsswb_256() {
-; CHECK-LABEL: @undef_packsswb_256(
-; CHECK-NEXT:    ret <32 x i8> undef
-;
-  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> undef)
-  ret <32 x i8> %1
-}
-
-define <32 x i8> @undef_packuswb_256() {
-; CHECK-LABEL: @undef_packuswb_256(
-; CHECK-NEXT:    ret <32 x i8> undef
-;
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> undef, <16 x i16> undef)
-  ret <32 x i8> %1
-}
-
-define <32 x i16> @undef_packssdw_512() {
-; CHECK-LABEL: @undef_packssdw_512(
-; CHECK-NEXT:    ret <32 x i16> undef
-;
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> undef, <16 x i32> undef)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @undef_packusdw_512() {
-; CHECK-LABEL: @undef_packusdw_512(
-; CHECK-NEXT:    ret <32 x i16> undef
-;
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> undef)
-  ret <32 x i16> %1
-}
-
-define <64 x i8> @undef_packsswb_512() {
-; CHECK-LABEL: @undef_packsswb_512(
-; CHECK-NEXT:    ret <64 x i8> undef
-;
-  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> undef)
-  ret <64 x i8> %1
-}
-
-define <64 x i8> @undef_packuswb_512() {
-; CHECK-LABEL: @undef_packuswb_512(
-; CHECK-NEXT:    ret <64 x i8> undef
-;
-  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> undef, <32 x i16> undef)
-  ret <64 x i8> %1
-}
-
-;
-; Constant Folding
-;
-
-define <8 x i16> @fold_packssdw_128() {
-; CHECK-LABEL: @fold_packssdw_128(
-; CHECK-NEXT:    ret <8 x i16> <i16 0, i16 -1, i16 32767, i16 -32768, i16 0, i16 0, i16 0, i16 0>
-;
-  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> <i32 0, i32 -1, i32 65536, i32 -131072>, <4 x i32> zeroinitializer)
-  ret <8 x i16> %1
-}
-
-define <8 x i16> @fold_packusdw_128() {
-; CHECK-LABEL: @fold_packusdw_128(
-; CHECK-NEXT:    ret <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 0, i16 -32768, i16 -1>
-;
-  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> <i32 0, i32 -1, i32 32768, i32 65537>)
-  ret <8 x i16> %1
-}
-
-define <16 x i8> @fold_packsswb_128() {
-; CHECK-LABEL: @fold_packsswb_128(
-; CHECK-NEXT:    ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
-;
-  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> zeroinitializer, <8 x i16> undef)
-  ret <16 x i8> %1
-}
-
-define <16 x i8> @fold_packuswb_128() {
-; CHECK-LABEL: @fold_packuswb_128(
-; CHECK-NEXT:    ret <16 x i8> <i8 0, i8 1, i8 0, i8 -1, i8 0, i8 0, i8 0, i8 15, i8 0, i8 127, i8 0, i8 1, i8 0, i8 1, i8 0, i8 0>
-;
-  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 1, i16 -1, i16 255, i16 65535, i16 -32768, i16 -127, i16 15>, <8 x i16> <i16 -15, i16 127, i16 32768, i16 -65535, i16 -255, i16 1, i16 -1, i16 0>)
-  ret <16 x i8> %1
-}
-
-define <16 x i16> @fold_packssdw_256() {
-; CHECK-LABEL: @fold_packssdw_256(
-; CHECK-NEXT:    ret <16 x i16> <i16 0, i16 256, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef>
-;
-  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> <i32 0, i32 256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>, <8 x i32> undef)
-  ret <16 x i16> %1
-}
-
-define <16 x i16> @fold_packusdw_256() {
-; CHECK-LABEL: @fold_packusdw_256(
-; CHECK-NEXT:    ret <16 x i16> <i16 0, i16 0, i16 0, i16 -1, i16 0, i16 256, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767>
-;
-  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> <i32 0, i32 -256, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767>, <8 x i32> <i32 0, i32 256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>)
-  ret <16 x i16> %1
-}
-
-define <32 x i8> @fold_packsswb_256() {
-; CHECK-LABEL: @fold_packsswb_256(
-; CHECK-NEXT:    ret <32 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
-;
-  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> zeroinitializer)
-  ret <32 x i8> %1
-}
-
-define <32 x i8> @fold_packuswb_256() {
-; CHECK-LABEL: @fold_packuswb_256(
-; CHECK-NEXT:    ret <32 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64>
-;
-  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> zeroinitializer, <16 x i16> <i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 256, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>)
-  ret <32 x i8> %1
-}
-
-define <32 x i16> @fold_packssdw_512() {
-; CHECK-LABEL: @fold_packssdw_512(
-; CHECK-NEXT:    ret <32 x i16> <i16 0, i16 512, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 512, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef>
-;
-  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> <i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767, i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>, <16 x i32> undef)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @fold_packusdw_512() {
-; CHECK-LABEL: @fold_packusdw_512(
-; CHECK-NEXT:    ret <32 x i16> <i16 0, i16 0, i16 0, i16 -1, i16 0, i16 512, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767, i16 0, i16 0, i16 0, i16 -1, i16 0, i16 512, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767>
-;
-  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> <i32 0, i32 -512, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767, i32 0, i32 -512, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767>, <16 x i32> <i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767, i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>)
-  ret <32 x i16> %1
-}
-
-define <64 x i8> @fold_packsswb_512() {
-; CHECK-LABEL: @fold_packsswb_512(
-; CHECK-NEXT:    ret <64 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
-;
-  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> zeroinitializer)
-  ret <64 x i8> %1
-}
-
-define <64 x i8> @fold_packuswb_512() {
-; CHECK-LABEL: @fold_packuswb_512(
-; CHECK-NEXT:    ret <64 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64>
-;
-  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> zeroinitializer, <32 x i16> <i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 512, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 512, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>)
-  ret <64 x i8> %1
-}
-
-;
-; Demanded Elts
-;
-
-define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: @elts_packssdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[A0:%.*]], <4 x i32> undef)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
-;
-  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 undef, i32 undef>
-  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>
-  %3 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %2)
-  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 7, i32 7, i32 7, i32 7>
-  ret <8 x i16> %4
-}
-
-define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: @elts_packusdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]])
-; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
-;
-  %1 = insertelement <4 x i32> %a0, i32 0, i32 0
-  %2 = insertelement <4 x i32> %a1, i32 0, i32 3
-  %3 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %1, <4 x i32> %2)
-  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
-  ret <8 x i16> %4
-}
-
-define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: @elts_packsswb_128(
-; CHECK-NEXT:    ret <16 x i8> zeroinitializer
-;
-  %1 = insertelement <8 x i16> %a0, i16 0, i32 0
-  %2 = insertelement <8 x i16> %a1, i16 0, i32 0
-  %3 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
-  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-  ret <16 x i8> %4
-}
-
-define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: @elts_packuswb_128(
-; CHECK-NEXT:    ret <16 x i8> undef
-;
-  %1 = insertelement <8 x i16> undef, i16 0, i32 0
-  %2 = insertelement <8 x i16> undef, i16 0, i32 0
-  %3 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
-  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-  ret <16 x i8> %4
-}
-
-define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: @elts_packssdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[A0:%.*]], <8 x i32> undef)
-; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
-;
-  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef>
-  %3 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %2)
-  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15>
-  ret <16 x i16> %4
-}
-
-define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: @elts_packusdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
-;
-  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-  %3 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %2)
-  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <16 x i16> %4
-}
-
-define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK-LABEL: @elts_packsswb_256(
-; CHECK-NEXT:    ret <32 x i8> zeroinitializer
-;
-  %1 = insertelement <16 x i16> %a0, i16 0, i32 0
-  %2 = insertelement <16 x i16> %a1, i16 0, i32 8
-  %3 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
-  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
-  ret <32 x i8> %4
-}
-
-define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK-LABEL: @elts_packuswb_256(
-; CHECK-NEXT:    ret <32 x i8> undef
-;
-  %1 = insertelement <16 x i16> undef, i16 0, i32 1
-  %2 = insertelement <16 x i16> undef, i16 0, i32 0
-  %3 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
-  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> zeroinitializer
-  ret <32 x i8> %4
-}
-
-define <32 x i16> @elts_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) {
-; CHECK-LABEL: @elts_packssdw_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A0:%.*]], <16 x i32> undef)
-; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
-;
-  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef, i32 undef, i32 10, i32 9, i32 undef, i32 undef, i32 14, i32 13, i32 undef>
-  %3 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %1, <16 x i32> %2)
-  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 18, i32 19, i32 20, i32 undef, i32 undef, i32 23, i32 24, i32 undef, i32 undef, i32 27, i32 28, i32 undef, i32 undef, i32 31>
-  ret <32 x i16> %4
-}
-
-define <32 x i16> @elts_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) {
-; CHECK-LABEL: @elts_packusdw_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
-;
-  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-  %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %1, <16 x i32> %2)
-  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <32 x i16> %4
-}
-
-define <64 x i8> @elts_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) {
-; CHECK-LABEL: @elts_packsswb_512(
-; CHECK-NEXT:    ret <64 x i8> zeroinitializer
-;
-  %1 = insertelement <32 x i16> %a0, i16 0, i32 0
-  %2 = insertelement <32 x i16> %a1, i16 0, i32 8
-  %3 = insertelement <32 x i16> %1, i16 0, i32 16
-  %4 = insertelement <32 x i16> %2, i16 0, i32 24
-  %5 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %3, <32 x i16> %4)
-  %6 = shufflevector <64 x i8> %5, <64 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
-  ret <64 x i8> %6
-}
-
-define <64 x i8> @elts_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) {
-; CHECK-LABEL: @elts_packuswb_512(
-; CHECK-NEXT:    ret <64 x i8> undef
-;
-  %1 = insertelement <32 x i16> undef, i16 0, i32 1
-  %2 = insertelement <32 x i16> undef, i16 0, i32 0
-  %3 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %1, <32 x i16> %2)
-  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> zeroinitializer
-  ret <64 x i8> %4
-}
-
-declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
-declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
-declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
-
-declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
-declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
-declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
-declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
-
-declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone
-declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone
-declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone
-declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone
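For reference, the saturating-pack semantics exercised by the tests deleted above can be expressed in generic IR with no target intrinsic at all. Below is a minimal sketch for the 128-bit packssdw case, assuming a plain clamp-then-truncate form built from compare+select; the function name and structure are illustrative only and are not part of this patch:

; Hypothetical generic-IR equivalent of llvm.x86.sse2.packssdw.128 (sketch, not from this patch).
define <8 x i16> @packssdw_128_generic(<4 x i32> %a0, <4 x i32> %a1) {
  ; Concatenate the two <4 x i32> sources into one <8 x i32> vector;
  ; for the 128-bit op the result is simply [sat(a0), sat(a1)].
  %conc = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Signed-saturate each lane to [-32768, 32767]: first clamp from above...
  %lt = icmp slt <8 x i32> %conc, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %lt, <8 x i32> %conc, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  ; ...then clamp from below.
  %gt = icmp sgt <8 x i32> %min, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %gt, <8 x i32> %min, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  ; Truncate the saturated lanes to the narrower element type.
  %res = trunc <8 x i32> %max to <8 x i16>
  ret <8 x i16> %res
}

This matches the constant-folding expectations in the deleted tests, e.g. fold_packssdw_128 above: 65536 clamps to 32767 and -131072 clamps to -32768 before the truncate. Note that the 256-bit and 512-bit variants interleave their results per 128-bit lane, so a generic form of those would additionally need a lane-interleaving shufflevector on the output.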